acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py

@@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
             aspect_value=source_info_aspect,
         )
 
+    @staticmethod
+    def _convert_sets_to_lists(obj: Any) -> Any:
+        """
+        Recursively converts all sets to lists in a Python object.
+        Works with nested dictionaries, lists, and sets.
+
+        Args:
+            obj: Any Python object that might contain sets
+
+        Returns:
+            The object with all sets converted to lists
+        """
+        if isinstance(obj, dict):
+            return {
+                key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
+                for key, value in obj.items()
+            }
+        elif isinstance(obj, list):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, set):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, tuple):
+            return tuple(
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            )
+        else:
+            return obj
+
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
         if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
+            redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
+            # This is required otherwise json dumps will fail
+            # with a TypeError: Object of type set is not JSON serializable
+            converted_recipe = (
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
+                    redacted_recipe
+                )
+            )
+            return json.dumps(converted_recipe)
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
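The rationale for _convert_sets_to_lists is easiest to see in a standalone sketch (a minimal reimplementation, not the packaged code): json.dumps raises a TypeError on set values, which can occur in a redacted recipe dict.

    import json

    def convert_sets_to_lists(obj):
        # Recursively replace sets with lists so the object becomes JSON-serializable.
        if isinstance(obj, dict):
            return {k: convert_sets_to_lists(v) for k, v in obj.items()}
        if isinstance(obj, (list, set)):
            return [convert_sets_to_lists(v) for v in obj]
        if isinstance(obj, tuple):
            return tuple(convert_sets_to_lists(v) for v in obj)
        return obj

    recipe = {"source": {"type": "snowflake", "schemas": {"PUBLIC", "ANALYTICS"}}}
    # json.dumps(recipe) would raise:
    #   TypeError: Object of type set is not JSON serializable
    print(json.dumps(convert_sets_to_lists(recipe)))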
datahub/ingestion/source/aws/aws_common.py

@@ -1,7 +1,12 @@
+import logging
+import os
 from datetime import datetime, timedelta, timezone
-from
+from enum import Enum
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
 
 import boto3
+import requests
 from boto3.session import Session
 from botocore.config import DEFAULT_TIMEOUT, Config
 from botocore.utils import fix_s3_host

@@ -14,6 +19,8 @@ from datahub.configuration.common import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from mypy_boto3_dynamodb import DynamoDBClient
     from mypy_boto3_glue import GlueClient

@@ -22,6 +29,26 @@ if TYPE_CHECKING:
     from mypy_boto3_sts import STSClient
 
 
+class AwsEnvironment(Enum):
+    EC2 = "EC2"
+    ECS = "ECS"
+    EKS = "EKS"
+    LAMBDA = "LAMBDA"
+    APP_RUNNER = "APP_RUNNER"
+    BEANSTALK = "ELASTIC_BEANSTALK"
+    CLOUD_FORMATION = "CLOUD_FORMATION"
+    UNKNOWN = "UNKNOWN"
+
+
+class AwsServicePrincipal(Enum):
+    LAMBDA = "lambda.amazonaws.com"
+    EKS = "eks.amazonaws.com"
+    APP_RUNNER = "apprunner.amazonaws.com"
+    ECS = "ecs.amazonaws.com"
+    ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com"
+    EC2 = "ec2.amazonaws.com"
+
+
 class AwsAssumeRoleConfig(PermissiveConfigModel):
     # Using the PermissiveConfigModel to allow the user to pass additional arguments.
 

@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel):
     )
 
 
+def get_instance_metadata_token() -> Optional[str]:
+    """Get IMDSv2 token"""
+    try:
+        response = requests.put(
+            "http://169.254.169.254/latest/api/token",
+            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
+            timeout=1,
+        )
+        if response.status_code == HTTPStatus.OK:
+            return response.text
+    except requests.exceptions.RequestException:
+        logger.debug("Failed to get IMDSv2 token")
+    return None
+
+
+def is_running_on_ec2() -> bool:
+    """Check if code is running on EC2 using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return False
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/instance-id",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        return response.status_code == HTTPStatus.OK
+    except requests.exceptions.RequestException:
+        return False
+
+
+def detect_aws_environment() -> AwsEnvironment:
+    """
+    Detect the AWS environment we're running in.
+    Order matters as some environments may have multiple indicators.
+    """
+    # Check Lambda first as it's most specific
+    if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
+        if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
+            return AwsEnvironment.CLOUD_FORMATION
+        return AwsEnvironment.LAMBDA
+
+    # Check EKS (IRSA)
+    if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
+        return AwsEnvironment.EKS
+
+    # Check App Runner
+    if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
+        return AwsEnvironment.APP_RUNNER
+
+    # Check ECS
+    if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+        "ECS_CONTAINER_METADATA_URI"
+    ):
+        return AwsEnvironment.ECS
+
+    # Check Elastic Beanstalk
+    if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
+        return AwsEnvironment.BEANSTALK
+
+    if is_running_on_ec2():
+        return AwsEnvironment.EC2
+
+    return AwsEnvironment.UNKNOWN
+
+
+def get_instance_role_arn() -> Optional[str]:
+    """Get role ARN from EC2 instance metadata using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return None
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/iam/security-credentials/",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        if response.status_code == 200:
+            role_name = response.text.strip()
+            if role_name:
+                sts = boto3.client("sts")
+                identity = sts.get_caller_identity()
+                return identity.get("Arn")
+    except Exception as e:
+        logger.debug(f"Failed to get instance role ARN: {e}")
+    return None
+
+
+def get_lambda_role_arn() -> Optional[str]:
+    """Get the Lambda function's role ARN"""
+    try:
+        function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+        if not function_name:
+            return None
+
+        lambda_client = boto3.client("lambda")
+        function_config = lambda_client.get_function_configuration(
+            FunctionName=function_name
+        )
+        return function_config.get("Role")
+    except Exception as e:
+        logger.debug(f"Failed to get Lambda role ARN: {e}")
+        return None
+
+
+def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
+    """
+    Get the current role ARN and source type based on the runtime environment.
+    Returns (role_arn, credential_source)
+    """
+    env = detect_aws_environment()
+
+    if env == AwsEnvironment.LAMBDA:
+        role_arn = get_lambda_role_arn()
+        return role_arn, AwsServicePrincipal.LAMBDA.value
+
+    elif env == AwsEnvironment.EKS:
+        role_arn = os.getenv("AWS_ROLE_ARN")
+        return role_arn, AwsServicePrincipal.EKS.value
+
+    elif env == AwsEnvironment.APP_RUNNER:
+        try:
+            sts = boto3.client("sts")
+            identity = sts.get_caller_identity()
+            return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value
+        except Exception as e:
+            logger.debug(f"Failed to get App Runner role: {e}")
+
+    elif env == AwsEnvironment.ECS:
+        try:
+            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+                "ECS_CONTAINER_METADATA_URI"
+            )
+            if metadata_uri:
+                response = requests.get(f"{metadata_uri}/task", timeout=1)
+                if response.status_code == HTTPStatus.OK:
+                    task_metadata = response.json()
+                    if "TaskARN" in task_metadata:
+                        return (
+                            task_metadata.get("TaskARN"),
+                            AwsServicePrincipal.ECS.value,
+                        )
+        except Exception as e:
+            logger.debug(f"Failed to get ECS task role: {e}")
+
+    elif env == AwsEnvironment.BEANSTALK:
+        # Beanstalk uses EC2 instance metadata
+        return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value
+
+    elif env == AwsEnvironment.EC2:
+        return get_instance_role_arn(), AwsServicePrincipal.EC2.value
+
+    return None, None
+
+
 def assume_role(
     role: AwsAssumeRoleConfig,
     aws_region: Optional[str],
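A usage sketch for the new detection helpers (hedged: the import path matches the module above, but the environment variables and ARN are illustrative fakes; on a real EKS pod IRSA sets them, not the caller):

    import os

    from datahub.ingestion.source.aws.aws_common import (
        AwsEnvironment,
        detect_aws_environment,
        get_current_identity,
    )

    # Simulate an EKS pod that uses IAM Roles for Service Accounts (IRSA).
    os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = "/var/run/secrets/token"  # fake path
    os.environ["AWS_ROLE_ARN"] = "arn:aws:iam::123456789012:role/irsa-role"  # fake ARN

    assert detect_aws_environment() == AwsEnvironment.EKS
    role_arn, source = get_current_identity()
    print(role_arn, source)  # arn:aws:iam::123456789012:role/irsa-role eks.amazonaws.com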
@@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel):
     )
     aws_profile: Optional[str] = Field(
         default=None,
-        description="
+        description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.",
     )
     aws_region: Optional[str] = Field(None, description="AWS region code.")
 

@@ -145,6 +329,7 @@ class AwsConnectionConfig(ConfigModel):
 
     def get_session(self) -> Session:
         if self.aws_access_key_id and self.aws_secret_access_key:
+            # Explicit credentials take precedence
             session = Session(
                 aws_access_key_id=self.aws_access_key_id,
                 aws_secret_access_key=self.aws_secret_access_key,

@@ -152,38 +337,57 @@ class AwsConnectionConfig(ConfigModel):
                 region_name=self.aws_region,
             )
         elif self.aws_profile:
+            # Named profile is second priority
             session = Session(
                 region_name=self.aws_region, profile_name=self.aws_profile
             )
         else:
-            # Use boto3's credential autodetection
+            # Use boto3's credential autodetection
             session = Session(region_name=self.aws_region)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        target_roles = self._normalized_aws_roles()
+        if target_roles:
+            current_role_arn, credential_source = get_current_identity()
+
+            # Only assume role if:
+            # 1. We're not in a known AWS environment with a role, or
+            # 2. We need to assume a different role than our current one
+            should_assume_role = current_role_arn is None or any(
+                role.RoleArn != current_role_arn for role in target_roles
+            )
+
+            if should_assume_role:
+                env = detect_aws_environment()
+                logger.debug(f"Assuming role(s) from {env.value} environment")
+
+                current_credentials = session.get_credentials()
+                if current_credentials is None:
+                    raise ValueError("No credentials available for role assumption")
+
+                credentials = {
+                    "AccessKeyId": current_credentials.access_key,
+                    "SecretAccessKey": current_credentials.secret_key,
+                    "SessionToken": current_credentials.token,
+                }
+
+                for role in target_roles:
+                    if self._should_refresh_credentials():
+                        credentials = assume_role(
+                            role=role,
+                            aws_region=self.aws_region,
+                            credentials=credentials,
+                        )
+                        if isinstance(credentials["Expiration"], datetime):
+                            self._credentials_expiration = credentials["Expiration"]
+
+                session = Session(
+                    aws_access_key_id=credentials["AccessKeyId"],
+                    aws_secret_access_key=credentials["SecretAccessKey"],
+                    aws_session_token=credentials["SessionToken"],
+                    region_name=self.aws_region,
                 )
-
-
-
-            session = Session(
-                aws_access_key_id=credentials["AccessKeyId"],
-                aws_secret_access_key=credentials["SecretAccessKey"],
-                aws_session_token=credentials["SessionToken"],
-                region_name=self.aws_region,
-            )
+            else:
+                logger.debug(f"Using existing role from {credential_source}")
 
         return session
 
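A hedged sketch of what the reworked get_session enables (the aws_role field follows the DataHub AWS connection config, but the ARNs are made up, and this is illustrative rather than a tested recipe): roles are assumed in order starting from the ambient credentials, and a redundant sts:AssumeRole is skipped when the ambient identity already matches the configured role.

    from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig

    config = AwsConnectionConfig(
        aws_region="us-east-1",
        # Role chaining: each role is assumed in order, starting from the
        # ambient credentials (instance profile, IRSA, named profile, etc.).
        aws_role=[
            "arn:aws:iam::123456789012:role/intermediate-role",
            "arn:aws:iam::123456789012:role/datahub-ingestion-role",
        ],
    )
    session = config.get_session()
    print(session.client("sts").get_caller_identity()["Arn"])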
datahub/ingestion/source/aws/glue.py

@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util

@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 

@@ -220,6 +220,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned = 0
     filtered: List[str] = dataclass_field(default_factory=list)
+    databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
     num_job_script_location_missing: int = 0
     num_job_script_location_invalid: int = 0

@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
         return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+        logger.debug("Getting all databases")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
         paginator = self.glue_client.get_paginator("get_databases")
 

@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
             pattern += "[?!TargetDatabase]"
 
         for database in paginator_response.search(pattern):
-            if self.source_config.database_pattern.allowed(database["Name"]):
+            if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+                self.source_config.catalog_id
+                and database.get("CatalogId")
+                and database.get("CatalogId") != self.source_config.catalog_id
+            ):
+                self.report.databases.dropped(database["Name"])
+            else:
+                self.report.databases.processed(database["Name"])
                 yield database
 
     def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+        logger.debug(f"Getting tables from database {database['Name']}")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
         paginator = self.glue_client.get_paginator("get_tables")
         database_name = database["Name"]
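The new databases field uses DataHub's EntityFilterReport pattern; a minimal sketch of how such a report accumulates processed/dropped names (the report class here is illustrative, not the Glue source's own):

    from dataclasses import dataclass

    from datahub.ingestion.api.report import EntityFilterReport, Report

    @dataclass
    class MyReport(Report):
        databases: EntityFilterReport = EntityFilterReport.field(type="database")

    report = MyReport()
    report.databases.processed("sales_db")   # kept: passed the database_pattern
    report.databases.dropped("scratch_db")   # filtered: pattern or catalog_id mismatch
    print(report.as_string())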
datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -206,9 +206,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
 
     def _init_schema_resolver(self) -> SchemaResolver:
         schema_resolution_required = (
-            self.config.use_queries_v2
-            or self.config.lineage_parse_view_ddl
-            or self.config.lineage_use_sql_parser
+            self.config.use_queries_v2 or self.config.lineage_use_sql_parser
         )
         schema_ingestion_enabled = (
             self.config.include_schema_metadata

@@ -255,18 +253,16 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         for project in projects:
             yield from self.bq_schema_extractor.get_project_workunits(project)
 
-
-
-
-
-
-
-
-
-            self.bq_schema_extractor.snapshot_refs_by_project,
-            self.bq_schema_extractor.snapshots_by_ref,
-        )
+        self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
+        yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+            [p.id for p in projects],
+            self.bq_schema_extractor.view_refs_by_project,
+            self.bq_schema_extractor.view_definitions,
+            self.bq_schema_extractor.snapshot_refs_by_project,
+            self.bq_schema_extractor.snapshots_by_ref,
+        )
 
+        if self.config.use_queries_v2:
             # if both usage and lineage are disabled then skip queries extractor piece
             if (
                 not self.config.include_usage_statistics

@@ -306,10 +302,6 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_table_lineage:
             yield from self.lineage_extractor.get_lineage_workunits(
                 [p.id for p in projects],
-                self.bq_schema_extractor.view_refs_by_project,
-                self.bq_schema_extractor.view_definitions,
-                self.bq_schema_extractor.snapshot_refs_by_project,
-                self.bq_schema_extractor.snapshots_by_ref,
                 self.bq_schema_extractor.table_refs,
             )
 
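After this refactor, audit-log lineage no longer threads view/snapshot state through get_lineage_workunits; a hedged sketch of the resulting call shape (the wrapper function and its source/project_ids parameters are hypothetical, while the attribute names come from the diff above):

    from typing import Iterable, List

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    def emit_table_lineage(source, project_ids: List[str]) -> Iterable[MetadataWorkUnit]:
        # Views and snapshots are now handled separately via the SQL aggregator,
        # so only project ids and the collected table refs are needed here.
        yield from source.lineage_extractor.get_lineage_workunits(
            project_ids,
            source.bq_schema_extractor.table_refs,
        )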
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -463,10 +463,6 @@ class BigQueryV2Config(
         default=True,
         description="Use sql parser to resolve view/table lineage.",
     )
-    lineage_parse_view_ddl: bool = Field(
-        default=True,
-        description="Sql parse view ddl to get lineage.",
-    )
 
     lineage_sql_parser_use_raw_names: bool = Field(
         default=False,

@@ -572,11 +568,9 @@ class BigQueryV2Config(
         "See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.",
     )
 
-
-
-
-
-    include_view_column_lineage: bool = Field(default=True, hidden_from_docs=True)
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
+    _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
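DataHub's pydantic_removed_field helper lets old recipes that still set a retired option load with a warning instead of failing validation. A minimal sketch of the pattern, assuming pydantic v1 (a generic reimplementation, not DataHub's exact helper):

    import warnings

    from pydantic import BaseModel, root_validator

    def removed_field(field_name: str):
        def _strip(cls, values: dict) -> dict:
            # Pre-validator: drop the retired key so parsing still succeeds.
            if field_name in values:
                warnings.warn(f"Config field {field_name} was removed; ignoring it.")
                values.pop(field_name)
            return values
        return root_validator(pre=True, allow_reuse=True)(_strip)

    class MyConfig(BaseModel):
        use_queries_v2: bool = True
        _lineage_parse_view_ddl = removed_field("lineage_parse_view_ddl")

    # An old recipe still parses; the removed key is dropped with a warning.
    cfg = MyConfig.parse_obj({"lineage_parse_view_ddl": False})
    print(cfg.use_queries_v2)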
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional
 

@@ -15,6 +15,7 @@ from google.cloud.bigquery.table import (
     TimePartitioningType,
 )
 
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels

@@ -393,13 +394,7 @@ class BigQuerySchemaApi:
             name=table.table_name,
             created=table.created,
             table_type=table.table_type,
-            last_altered=(
-                datetime.fromtimestamp(
-                    table.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if table.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(table.get("last_altered")),
             size_in_bytes=table.get("bytes"),
             rows_count=table.get("row_count"),
             comment=table.comment,

@@ -460,11 +455,7 @@ class BigQuerySchemaApi:
         return BigqueryView(
             name=view.table_name,
             created=view.created,
-            last_altered=(
-                datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
-                if view.get("last_altered") is not None
-                else None
-            ),
+            last_altered=(parse_ts_millis(view.get("last_altered"))),
             comment=view.comment,
             view_definition=view.view_definition,
             materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,

@@ -705,13 +696,7 @@ class BigQuerySchemaApi:
         return BigqueryTableSnapshot(
             name=snapshot.table_name,
             created=snapshot.created,
-            last_altered=(
-                datetime.fromtimestamp(
-                    snapshot.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if snapshot.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(snapshot.get("last_altered")),
             comment=snapshot.comment,
             ddl=snapshot.ddl,
             snapshot_time=snapshot.snapshot_time,
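parse_ts_millis centralizes the timestamp conversion the three removed blocks repeated inline; an equivalent minimal implementation (a sketch mirroring the removed code, not necessarily the exact helper in datahub.emitter.mce_builder):

    from datetime import datetime, timezone
    from typing import Optional

    def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
        # Millisecond epoch timestamp -> timezone-aware UTC datetime; None passes through.
        if ts is None:
            return None
        return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)

    assert parse_ts_millis(None) is None
    print(parse_ts_millis(1700000000000))  # 2023-11-14 22:13:20+00:00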
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -653,14 +653,11 @@ class BigQuerySchemaGenerator:
             self.report.report_dropped(table_identifier.raw_table_name())
             return
 
-
-
-
-        )
-        self.
-        if self.config.lineage_parse_view_ddl and view.view_definition:
-            self.view_refs_by_project[project_id].add(table_ref)
-            self.view_definitions[table_ref] = view.view_definition
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if view.view_definition:
+            self.view_refs_by_project[project_id].add(table_ref)
+            self.view_definitions[table_ref] = view.view_definition
 
         view.column_count = len(columns)
         if not view.column_count:

@@ -701,14 +698,11 @@ class BigQuerySchemaGenerator:
                 f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}"
             )
 
-
-
-
-        )
-        self.
-        if snapshot.base_table_identifier:
-            self.snapshot_refs_by_project[project_id].add(table_ref)
-            self.snapshots_by_ref[table_ref] = snapshot
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if snapshot.base_table_identifier:
+            self.snapshot_refs_by_project[project_id].add(table_ref)
+            self.snapshots_by_ref[table_ref] = snapshot
 
         yield from self.gen_snapshot_dataset_workunits(
             table=snapshot,

@@ -1148,7 +1142,7 @@ class BigQuerySchemaGenerator:
             foreignKeys=foreign_keys if foreign_keys else None,
         )
 
-        if self.config.
+        if self.config.lineage_use_sql_parser:
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
             )
datahub/ingestion/source/bigquery_v2/lineage.py

@@ -291,16 +291,15 @@ class BigqueryLineageExtractor:
         snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
     ) -> Iterable[MetadataWorkUnit]:
         for project in projects:
-
-
-
-            self.
-
-
-
-
-
-            )
+            for view in view_refs_by_project[project]:
+                self.datasets_skip_audit_log_lineage.add(view)
+                self.aggregator.add_view_definition(
+                    view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
+                        BigQueryTableRef.from_string_name(view)
+                    ),
+                    view_definition=view_definitions[view],
+                    default_db=project,
+                )
 
             for snapshot_ref in snapshot_refs_by_project[project]:
                 snapshot = snapshots_by_ref[snapshot_ref]

@@ -322,23 +321,11 @@ class BigqueryLineageExtractor:
     def get_lineage_workunits(
         self,
         projects: List[str],
-        view_refs_by_project: Dict[str, Set[str]],
-        view_definitions: FileBackedDict[str],
-        snapshot_refs_by_project: Dict[str, Set[str]],
-        snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
         table_refs: Set[str],
     ) -> Iterable[MetadataWorkUnit]:
         if not self._should_ingest_lineage():
             return
 
-        yield from self.get_lineage_workunits_for_views_and_snapshots(
-            projects,
-            view_refs_by_project,
-            view_definitions,
-            snapshot_refs_by_project,
-            snapshots_by_ref,
-        )
-
         if self.config.use_exported_bigquery_audit_metadata:
             projects = ["*"]  # project_id not used when using exported metadata
 
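View lineage now flows through the SQL parsing aggregator rather than through get_lineage_workunits; a hedged sketch of the registration call (the constructor arguments are illustrative and may not match the aggregator's full signature, and the URN and SQL are made up):

    from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

    # Register a view definition; the aggregator parses the SQL to derive
    # upstream lineage, replacing the audit-log path for views.
    aggregator = SqlParsingAggregator(
        platform="bigquery",
        generate_lineage=True,
        generate_usage_statistics=False,
        generate_operations=False,
    )
    aggregator.add_view_definition(
        view_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.dataset.my_view,PROD)",
        view_definition="SELECT id, name FROM `my-project.dataset.base_table`",
        default_db="my-project",
    )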