acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic by the registry.

Files changed (51)
  1. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
  2. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataproduct/dataproduct.py +26 -0
  5. datahub/cli/config_utils.py +18 -10
  6. datahub/cli/docker_check.py +2 -1
  7. datahub/cli/docker_cli.py +4 -2
  8. datahub/cli/graphql_cli.py +1422 -0
  9. datahub/cli/quickstart_versioning.py +2 -2
  10. datahub/cli/specific/dataproduct_cli.py +2 -4
  11. datahub/cli/specific/user_cli.py +172 -1
  12. datahub/configuration/env_vars.py +331 -0
  13. datahub/configuration/kafka.py +6 -4
  14. datahub/emitter/mce_builder.py +2 -4
  15. datahub/emitter/rest_emitter.py +15 -15
  16. datahub/entrypoints.py +2 -0
  17. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  18. datahub/ingestion/api/source.py +5 -0
  19. datahub/ingestion/graph/client.py +197 -0
  20. datahub/ingestion/graph/config.py +2 -2
  21. datahub/ingestion/sink/datahub_rest.py +6 -5
  22. datahub/ingestion/source/aws/aws_common.py +20 -13
  23. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
  24. datahub/ingestion/source/grafana/models.py +5 -0
  25. datahub/ingestion/source/iceberg/iceberg.py +39 -19
  26. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
  27. datahub/ingestion/source/mode.py +13 -0
  28. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  29. datahub/ingestion/source/schema_inference/object.py +22 -6
  30. datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
  31. datahub/ingestion/source/sql/mssql/source.py +7 -1
  32. datahub/ingestion/source/sql/teradata.py +80 -65
  33. datahub/ingestion/source/unity/config.py +31 -0
  34. datahub/ingestion/source/unity/proxy.py +73 -0
  35. datahub/ingestion/source/unity/source.py +27 -70
  36. datahub/ingestion/source/unity/usage.py +46 -4
  37. datahub/metadata/_internal_schema_classes.py +544 -544
  38. datahub/metadata/_urns/urn_defs.py +1728 -1728
  39. datahub/metadata/schema.avsc +15157 -15157
  40. datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
  41. datahub/sql_parsing/sqlglot_lineage.py +7 -0
  42. datahub/telemetry/telemetry.py +8 -3
  43. datahub/utilities/file_backed_collections.py +2 -2
  44. datahub/utilities/is_pytest.py +3 -2
  45. datahub/utilities/logging_manager.py +22 -6
  46. datahub/utilities/sample_data.py +5 -4
  47. datahub/emitter/sql_parsing_builder.py +0 -306
  48. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
  49. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
  50. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
  51. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import functools
 import json
 import logging
-import os
 import re
 import time
 from collections import defaultdict
@@ -33,7 +32,6 @@ from typing_extensions import deprecated
 from datahub._version import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
-from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
     ConfigEnum,
     ConfigModel,
@@ -42,6 +40,14 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
+from datahub.configuration.env_vars import (
+    get_emit_mode,
+    get_emitter_trace,
+    get_rest_emitter_batch_max_payload_bytes,
+    get_rest_emitter_batch_max_payload_length,
+    get_rest_emitter_default_endpoint,
+    get_rest_emitter_default_retry_max_times,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
@@ -82,11 +88,9 @@ _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
     504,
 ]
 _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
-_DEFAULT_RETRY_MAX_TIMES = int(
-    os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
-)
+_DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())

-_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+_DATAHUB_EMITTER_TRACE = get_emitter_trace()

 _DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK

@@ -98,17 +102,13 @@ TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
 # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
-INGEST_MAX_PAYLOAD_BYTES = int(
-    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", 15 * 1024 * 1024)
-)
+INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()

 # This limit is somewhat arbitrary. All GMS endpoints will timeout
 # and return a 500 if processing takes too long. To avoid sending
 # too much to the backend and hitting a timeout, we try to limit
 # the number of MCPs we send in a batch.
-BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
-    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
-)
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()


 def preserve_unicode_escapes(obj: Any) -> Any:
@@ -147,7 +147,7 @@ class EmitMode(ConfigEnum):

 _DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
     EmitMode,
-    os.getenv("DATAHUB_EMIT_MODE", EmitMode.SYNC_PRIMARY),
+    get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )


@@ -158,7 +158,7 @@ class RestSinkEndpoint(ConfigEnum):

 DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
     RestSinkEndpoint,
-    os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT", RestSinkEndpoint.RESTLI),
+    get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )


@@ -478,7 +478,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         if self._openapi_ingestion is None:
             # No constructor parameter
             if (
-                not os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+                not get_rest_emitter_default_endpoint()
                 and self._session_config.client_mode == ClientMode.SDK
                 and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
             ):
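The new datahub/configuration/env_vars.py module (+331 lines) that these imports come from is not included in the hunks above. As a rough illustration only, the getters presumably centralize the same environment variables and defaults that the removed inline os.getenv calls used; the sketch below is an assumption inferred from the call sites, not the actual module contents.

import os


def get_rest_emitter_default_retry_max_times() -> str:
    # Assumed: same env var and default ("4") as the removed inline call;
    # the rest_emitter call site still wraps the result in int().
    return os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")


def get_rest_emitter_batch_max_payload_bytes() -> int:
    # Assumed: returns an int, since the call site no longer wraps it in int().
    return int(
        os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", str(15 * 1024 * 1024))
    )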
datahub/entrypoints.py CHANGED
@@ -22,6 +22,7 @@ from datahub.cli.docker_cli import docker
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.cli.exists_cli import exists
 from datahub.cli.get_cli import get
+from datahub.cli.graphql_cli import graphql
 from datahub.cli.ingest_cli import ingest
 from datahub.cli.migrate import migrate
 from datahub.cli.put_cli import put
@@ -169,6 +170,7 @@ datahub.add_command(ingest)
 datahub.add_command(delete)
 datahub.add_command(exists)
 datahub.add_command(get)
+datahub.add_command(graphql)
 datahub.add_command(put)
 datahub.add_command(state)
 datahub.add_command(telemetry_cli)
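The new graphql_cli.py (+1422 lines) is not shown in this diff, so no specific subcommands or flags are assumed here. A quick way to see what the newly registered "graphql" group exposes is click's test runner (or simply running "datahub graphql --help" from a shell):

from click.testing import CliRunner

from datahub.entrypoints import datahub

# Prints the help text of the new "graphql" command group registered above.
result = CliRunner().invoke(datahub, ["graphql", "--help"])
print(result.output)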
datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py ADDED
@@ -0,0 +1,87 @@
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import InputFieldClass, InputFieldsClass
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class ValidateInputFieldsProcessor:
+    def __init__(self, report: "SourceReport"):
+        self.report = report
+
+    def validate_input_fields(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Validate input fields and filter out invalid ones.
+
+        Invalid input fields have empty or missing fieldPath values, which would cause
+        URN generation to fail when sent to the server. This processor filters them out
+        and reports them as warnings.
+        """
+        for wu in stream:
+            input_fields_aspect = wu.get_aspect_of_type(InputFieldsClass)
+            if input_fields_aspect and input_fields_aspect.fields:
+                valid_fields: List[InputFieldClass] = []
+                invalid_count = 0
+
+                for input_field in input_fields_aspect.fields:
+                    if (
+                        input_field.schemaField
+                        and input_field.schemaField.fieldPath
+                        and input_field.schemaField.fieldPath.strip()
+                    ):
+                        valid_fields.append(input_field)
+                    else:
+                        invalid_count += 1
+
+                if invalid_count > 0:
+                    logger.debug(
+                        f"Filtered {invalid_count} invalid input field(s) with empty fieldPath for {wu.get_urn()}"
+                    )
+                    self.report.num_input_fields_filtered += invalid_count
+                    self.report.warning(
+                        title="Invalid input fields filtered",
+                        message="Input fields with empty fieldPath values were filtered out to prevent ingestion errors",
+                        context=f"Filtered {invalid_count} invalid input field(s) for {wu.get_urn()}",
+                    )
+
+                # Update the aspect with only valid fields
+                if valid_fields:
+                    input_fields_aspect.fields = valid_fields
+                else:
+                    # If no valid fields remain, skip this workunit entirely
+                    logger.debug(
+                        f"All input fields were invalid for {wu.get_urn()}, skipping InputFieldsClass workunit"
+                    )
+                    # Don't yield this workunit
+                    continue
+
+            yield wu
+
+    def _remove_input_fields_aspect(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        """Remove InputFieldsClass aspect from a workunit."""
+        # For MCPs, we can simply not yield the aspect
+        # For MCEs, we need to remove it from the snapshot
+        if hasattr(wu.metadata, "aspect") and isinstance(
+            wu.metadata.aspect, InputFieldsClass
+        ):
+            # This is an MCP with InputFieldsClass, skip it
+            return wu
+
+        if hasattr(wu.metadata, "proposedSnapshot"):
+            snapshot = wu.metadata.proposedSnapshot
+            if hasattr(snapshot, "aspects"):
+                snapshot.aspects = [
+                    aspect
+                    for aspect in snapshot.aspects
+                    if not isinstance(aspect, InputFieldsClass)
+                ]
+
+        return wu
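A hedged usage sketch (not part of the diff) of the new processor with one valid field and one empty fieldPath. The InputFieldClass/SchemaFieldClass constructor arguments and the placeholder URNs below are assumptions based on the generated metadata classes, not taken from this release.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
    ValidateInputFieldsProcessor,
)
from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import (
    InputFieldClass,
    InputFieldsClass,
    NumberTypeClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
)

DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.orders,PROD)"


def _field(path: str) -> InputFieldClass:
    # fieldPath="" is exactly the case the processor filters out.
    return InputFieldClass(
        schemaFieldUrn=f"urn:li:schemaField:({DATASET_URN},{path or 'unknown'})",
        schemaField=SchemaFieldClass(
            fieldPath=path,
            type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
            nativeDataType="number",
        ),
    )


report = SourceReport()
wu = MetadataChangeProposalWrapper(
    entityUrn="urn:li:chart:(looker,1)",
    aspect=InputFieldsClass(fields=[_field("price"), _field("")]),
).as_workunit()

out = list(ValidateInputFieldsProcessor(report).validate_input_fields([wu]))
# The empty-fieldPath entry is dropped and counted in report.num_input_fields_filtered;
# the workunit itself is still emitted because one valid field remains.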
datahub/ingestion/api/source.py CHANGED
@@ -31,6 +31,9 @@ from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import
 from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
     EnsureAspectSizeProcessor,
 )
+from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
+    ValidateInputFieldsProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import ExamplesReport, Report
@@ -215,6 +218,7 @@ class SourceReport(ExamplesReport, IngestionStageReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
+    num_input_fields_filtered: int = 0

     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)

@@ -543,6 +547,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            ValidateInputFieldsProcessor(self.get_report()).validate_input_fields,
             EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]

datahub/ingestion/graph/client.py CHANGED
@@ -30,6 +30,7 @@ from typing_extensions import deprecated

 from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
+from datahub.cli.cli_utils import guess_frontend_url_from_gms_url
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
@@ -2071,6 +2072,202 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         return res["reportAssertionResult"]

+    def _get_invite_token(self) -> str:
+        """
+        Retrieve an invite token for user creation.
+
+        Returns:
+            Invite token string
+
+        Raises:
+            OperationalError: If invite token retrieval fails
+        """
+        get_invite_token_query = """
+            query getInviteToken($input: GetInviteTokenInput!) {
+                getInviteToken(input: $input) {
+                    inviteToken
+                }
+            }
+        """
+
+        try:
+            invite_token_response = self.execute_graphql(
+                query=get_invite_token_query,
+                variables={"input": {}},
+            )
+            invite_token = invite_token_response.get("getInviteToken", {}).get(
+                "inviteToken"
+            )
+            if not invite_token:
+                raise OperationalError(
+                    "Failed to retrieve invite token. Ensure you have admin permissions.",
+                    {},
+                )
+            return invite_token
+        except Exception as e:
+            raise OperationalError(
+                f"Failed to retrieve invite token: {str(e)}", {}
+            ) from e
+
+    def _create_user_with_token(
+        self,
+        user_urn: str,
+        email: str,
+        display_name: str,
+        password: str,
+        invite_token: str,
+    ) -> None:
+        """
+        Create a user using the signup endpoint.
+
+        Args:
+            user_urn: User URN (urn:li:corpuser:{user_id})
+            email: User's email address
+            display_name: Full display name for the user
+            password: User's password
+            invite_token: Invite token for user creation
+
+        Raises:
+            OperationalError: If user creation fails
+        """
+        frontend_url = guess_frontend_url_from_gms_url(self._gms_server)
+        signup_url = f"{frontend_url}/signUp"
+        signup_payload = {
+            "userUrn": user_urn,
+            "email": email,
+            "fullName": display_name,
+            "password": password,
+            "title": "Other",
+            "inviteToken": invite_token,
+        }
+
+        logger.debug(
+            f"Creating user with URN={user_urn}, email={email} at URL: {signup_url}"
+        )
+        logger.debug(
+            f"Signup payload: {json.dumps({**signup_payload, 'password': '***'})}"
+        )
+
+        try:
+            response = self._session.post(signup_url, json=signup_payload)
+            logger.debug(f"Response status code: {response.status_code}")
+            logger.debug(f"Response headers: {dict(response.headers)}")
+            logger.debug(f"Response content length: {len(response.text)}")
+
+            response.raise_for_status()
+
+            # The /signUp endpoint returns 200 with empty body on success
+            logger.debug("User created successfully")
+
+        except HTTPError as http_err:
+            error_details = {
+                "url": signup_url,
+                "status_code": response.status_code,
+                "response_text": response.text[:500],
+            }
+            try:
+                error_json = response.json()
+                error_details["error_response"] = error_json
+                error_msg = error_json.get("message", str(http_err))
+            except JSONDecodeError:
+                error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+
+            raise OperationalError(
+                f"Failed to create user: {error_msg}",
+                error_details,
+            ) from http_err
+        except Exception as e:
+            raise OperationalError(
+                f"Failed to create user: {str(e)}",
+                {"url": signup_url, "error_type": type(e).__name__},
+            ) from e
+
+    def _assign_role_to_user(self, user_urn: str, role: str) -> None:
+        """
+        Assign a role to a user.
+
+        Args:
+            user_urn: User URN
+            role: Role to assign (Admin, Editor, or Reader)
+
+        Raises:
+            ValueError: If role is invalid
+        """
+        normalized_role = role.capitalize()
+        valid_roles = ["Admin", "Editor", "Reader"]
+        if normalized_role not in valid_roles:
+            raise ValueError(
+                f"Invalid role '{role}'. Must be one of: {', '.join(valid_roles)}"
+            )
+
+        role_urn = f"urn:li:dataHubRole:{normalized_role}"
+
+        batch_assign_role_mutation = """
+            mutation batchAssignRole($input: BatchAssignRoleInput!) {
+                batchAssignRole(input: $input)
+            }
+        """
+
+        try:
+            self.execute_graphql(
+                query=batch_assign_role_mutation,
+                variables={"input": {"roleUrn": role_urn, "actors": [user_urn]}},
+            )
+        except Exception as e:
+            logger.warning(f"Role assignment failed for user {user_urn}: {str(e)}")
+            raise
+
+    def create_native_user(
+        self,
+        user_id: str,
+        email: str,
+        display_name: str,
+        password: str,
+        role: Optional[str] = None,
+    ) -> str:
+        """
+        Create a native DataHub user with email/password authentication.
+
+        Args:
+            user_id: User identifier (will be used in the URN)
+            email: User's email address
+            display_name: Full display name for the user
+            password: User's password
+            role: Optional role to assign (Admin, Editor, or Reader)
+
+        Returns:
+            User URN of the created user (urn:li:corpuser:{user_id})
+
+        Raises:
+            OperationalError: If user creation fails
+            ValueError: If role is invalid
+        """
+        # Validate role before creating user
+        if role:
+            normalized_role = role.capitalize()
+            valid_roles = ["Admin", "Editor", "Reader"]
+            if normalized_role not in valid_roles:
+                raise ValueError(
+                    f"Invalid role '{role}'. Must be one of: {', '.join(valid_roles)}"
+                )

+        user_urn = f"urn:li:corpuser:{user_id}"
+
+        invite_token = self._get_invite_token()
+        self._create_user_with_token(
+            user_urn, email, display_name, password, invite_token
+        )
+
+        if role:
+            try:
+                self._assign_role_to_user(user_urn, role)
+            except Exception as e:
+                logger.warning(
+                    f"User {email} created successfully, but role assignment failed: {str(e)}"
+                )
+
+        return user_urn
+
     def close(self) -> None:
         self._make_schema_resolver.cache_clear()
         super().close()
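A hedged usage sketch (not part of the diff) for the new create_native_user helper. The server URL, token, and user details are placeholders; the caller needs permission to fetch an invite token.

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(
    DatahubClientConfig(server="http://localhost:8080", token="<admin-token>")
)

# Creates the native user via the frontend /signUp endpoint, then assigns the Reader
# role; per the implementation above, a failed role assignment only logs a warning.
user_urn = graph.create_native_user(
    user_id="jdoe",
    email="jdoe@example.com",
    display_name="Jane Doe",
    password="ChangeMe123!",
    role="Reader",
)
assert user_urn == "urn:li:corpuser:jdoe"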
datahub/ingestion/graph/config.py CHANGED
@@ -1,8 +1,8 @@
-import os
 from enum import Enum, auto
 from typing import Dict, List, Optional

 from datahub.configuration.common import ConfigModel
+from datahub.configuration.env_vars import get_datahub_component


 class ClientMode(Enum):
@@ -11,7 +11,7 @@ class ClientMode(Enum):
     SDK = auto()


-DATAHUB_COMPONENT_ENV: str = os.getenv("DATAHUB_COMPONENT", "datahub").lower()
+DATAHUB_COMPONENT_ENV: str = get_datahub_component().lower()


 class DatahubClientConfig(ConfigModel):
datahub/ingestion/sink/datahub_rest.py CHANGED
@@ -3,7 +3,6 @@ import contextlib
 import dataclasses
 import functools
 import logging
-import os
 import threading
 import uuid
 from enum import auto
@@ -16,6 +15,10 @@ from datahub.configuration.common import (
     ConfigurationError,
     OperationalError,
 )
+from datahub.configuration.env_vars import (
+    get_rest_sink_default_max_threads,
+    get_rest_sink_default_mode,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
@@ -47,9 +50,7 @@ from datahub.utilities.server_config_util import set_gms_config

 logger = logging.getLogger(__name__)

-_DEFAULT_REST_SINK_MAX_THREADS = int(
-    os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", 15)
-)
+_DEFAULT_REST_SINK_MAX_THREADS = get_rest_sink_default_max_threads()


 class RestSinkMode(ConfigEnum):
@@ -63,7 +64,7 @@ class RestSinkMode(ConfigEnum):


 _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
-    RestSinkMode, os.getenv("DATAHUB_REST_SINK_DEFAULT_MODE", RestSinkMode.ASYNC_BATCH)
+    RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
 )

datahub/ingestion/source/aws/aws_common.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from datetime import datetime, timedelta, timezone
 from enum import Enum
 from http import HTTPStatus
@@ -17,6 +16,16 @@ from datahub.configuration.common import (
     ConfigModel,
     PermissiveConfigModel,
 )
+from datahub.configuration.env_vars import (
+    get_aws_app_runner_service_id,
+    get_aws_execution_env,
+    get_aws_lambda_function_name,
+    get_aws_role_arn,
+    get_aws_web_identity_token_file,
+    get_ecs_container_metadata_uri,
+    get_ecs_container_metadata_uri_v4,
+    get_elastic_beanstalk_environment_name,
+)
 from datahub.configuration.source_common import EnvConfigMixin

 logger = logging.getLogger(__name__)
@@ -100,27 +109,25 @@ def detect_aws_environment() -> AwsEnvironment:
     Order matters as some environments may have multiple indicators.
     """
     # Check Lambda first as it's most specific
-    if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
-        if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
+    if get_aws_lambda_function_name():
+        if (get_aws_execution_env() or "").startswith("CloudFormation"):
             return AwsEnvironment.CLOUD_FORMATION
         return AwsEnvironment.LAMBDA

     # Check EKS (IRSA)
-    if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
+    if get_aws_web_identity_token_file() and get_aws_role_arn():
         return AwsEnvironment.EKS

     # Check App Runner
-    if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
+    if get_aws_app_runner_service_id():
         return AwsEnvironment.APP_RUNNER

     # Check ECS
-    if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
-        "ECS_CONTAINER_METADATA_URI"
-    ):
+    if get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri():
         return AwsEnvironment.ECS

     # Check Elastic Beanstalk
-    if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
+    if get_elastic_beanstalk_environment_name():
         return AwsEnvironment.BEANSTALK

     if is_running_on_ec2():
@@ -155,7 +162,7 @@ def get_instance_role_arn() -> Optional[str]:
 def get_lambda_role_arn() -> Optional[str]:
     """Get the Lambda function's role ARN"""
     try:
-        function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+        function_name = get_aws_lambda_function_name()
         if not function_name:
             return None

@@ -181,7 +188,7 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
         return role_arn, AwsServicePrincipal.LAMBDA.value

     elif env == AwsEnvironment.EKS:
-        role_arn = os.getenv("AWS_ROLE_ARN")
+        role_arn = get_aws_role_arn()
         return role_arn, AwsServicePrincipal.EKS.value

     elif env == AwsEnvironment.APP_RUNNER:
@@ -194,8 +201,8 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:

     elif env == AwsEnvironment.ECS:
         try:
-            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
-                "ECS_CONTAINER_METADATA_URI"
+            metadata_uri = (
+                get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri()
             )
             if metadata_uri:
                 response = requests.get(f"{metadata_uri}/task", timeout=1)
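The public detection helpers keep their names after this refactor, so existing callers are unaffected; a minimal usage sketch (assuming the new env_vars getters read the environment at call time):

from datahub.ingestion.source.aws.aws_common import (
    AwsEnvironment,
    detect_aws_environment,
    get_current_identity,
)

env = detect_aws_environment()
print(f"Detected AWS environment: {env}")

if env == AwsEnvironment.LAMBDA:
    # Resolves the execution role from the Lambda function configuration.
    role_arn, principal = get_current_identity()
    print(f"Running as {role_arn} ({principal})")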
datahub/ingestion/source/bigquery_v2/bigquery_config.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 import re
 from copy import deepcopy
 from datetime import timedelta
@@ -8,6 +7,7 @@ from typing import Dict, List, Optional, Union
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
+from datahub.configuration.env_vars import get_bigquery_schema_parallelism
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -31,9 +31,7 @@ from datahub.ingestion.source.usage.usage_common import BaseUsageConfig

 logger = logging.getLogger(__name__)

-DEFAULT_BQ_SCHEMA_PARALLELISM = int(
-    os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
-)
+DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()

 # Regexp for sharded tables.
 # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
datahub/ingestion/source/grafana/models.py CHANGED
@@ -92,6 +92,7 @@ class Dashboard(_GrafanaBaseModel):
         """Custom parsing to handle nested panel extraction."""
         dashboard_data = data.get("dashboard", {})
         _panel_data = dashboard_data.get("panels", [])
+        panels = []
         try:
             panels = cls.extract_panels(_panel_data)
         except Exception as e:
@@ -108,6 +109,10 @@ class Dashboard(_GrafanaBaseModel):
         if "meta" in dashboard_dict:
             del dashboard_dict["meta"]

+        # Handle refresh field type mismatch - convert boolean to string
+        if "refresh" in dashboard_dict and isinstance(dashboard_dict["refresh"], bool):
+            dashboard_dict["refresh"] = str(dashboard_dict["refresh"])
+
         return super().parse_obj(dashboard_dict)
