acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of acryl-datahub has been flagged as potentially problematic.

Files changed (120)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
@@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
             aspect_value=source_info_aspect,
         )
 
+    @staticmethod
+    def _convert_sets_to_lists(obj: Any) -> Any:
+        """
+        Recursively converts all sets to lists in a Python object.
+        Works with nested dictionaries, lists, and sets.
+
+        Args:
+            obj: Any Python object that might contain sets
+
+        Returns:
+            The object with all sets converted to lists
+        """
+        if isinstance(obj, dict):
+            return {
+                key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
+                for key, value in obj.items()
+            }
+        elif isinstance(obj, list):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, set):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, tuple):
+            return tuple(
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            )
+        else:
+            return obj
+
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
         if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
+            redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
+            # This is required otherwise json dumps will fail
+            # with a TypeError: Object of type set is not JSON serializable
+            converted_recipe = (
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
+                    redacted_recipe
+                )
+            )
+            return json.dumps(converted_recipe)
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
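Reviewer note: the new helper exists because json.dumps rejects Python sets. A minimal standalone sketch of the failure and the fix, outside the class (the function name and the sample recipe below are illustrative only, not part of the package):

import json

def convert_sets_to_lists(obj):
    # Simplified version of the helper above: replace sets (and any nested
    # containers holding them) with JSON-friendly lists.
    if isinstance(obj, dict):
        return {k: convert_sets_to_lists(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [convert_sets_to_lists(v) for v in obj]
    return obj

recipe = {"source": {"type": "glue", "config": {"platforms": {"glue", "athena"}}}}
try:
    json.dumps(recipe)
except TypeError as err:
    print(err)  # Object of type set is not JSON serializable
print(json.dumps(convert_sets_to_lists(recipe)))  # serializes cleanly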
datahub/ingestion/source/aws/aws_common.py
@@ -1,7 +1,12 @@
+import logging
+import os
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+from enum import Enum
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
 
 import boto3
+import requests
 from boto3.session import Session
 from botocore.config import DEFAULT_TIMEOUT, Config
 from botocore.utils import fix_s3_host
@@ -14,6 +19,8 @@ from datahub.configuration.common import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from mypy_boto3_dynamodb import DynamoDBClient
     from mypy_boto3_glue import GlueClient
@@ -22,6 +29,26 @@ if TYPE_CHECKING:
     from mypy_boto3_sts import STSClient
 
 
+class AwsEnvironment(Enum):
+    EC2 = "EC2"
+    ECS = "ECS"
+    EKS = "EKS"
+    LAMBDA = "LAMBDA"
+    APP_RUNNER = "APP_RUNNER"
+    BEANSTALK = "ELASTIC_BEANSTALK"
+    CLOUD_FORMATION = "CLOUD_FORMATION"
+    UNKNOWN = "UNKNOWN"
+
+
+class AwsServicePrincipal(Enum):
+    LAMBDA = "lambda.amazonaws.com"
+    EKS = "eks.amazonaws.com"
+    APP_RUNNER = "apprunner.amazonaws.com"
+    ECS = "ecs.amazonaws.com"
+    ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com"
+    EC2 = "ec2.amazonaws.com"
+
+
 class AwsAssumeRoleConfig(PermissiveConfigModel):
     # Using the PermissiveConfigModel to allow the user to pass additional arguments.
 
@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel):
     )
 
 
+def get_instance_metadata_token() -> Optional[str]:
+    """Get IMDSv2 token"""
+    try:
+        response = requests.put(
+            "http://169.254.169.254/latest/api/token",
+            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
+            timeout=1,
+        )
+        if response.status_code == HTTPStatus.OK:
+            return response.text
+    except requests.exceptions.RequestException:
+        logger.debug("Failed to get IMDSv2 token")
+    return None
+
+
+def is_running_on_ec2() -> bool:
+    """Check if code is running on EC2 using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return False
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/instance-id",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        return response.status_code == HTTPStatus.OK
+    except requests.exceptions.RequestException:
+        return False
+
+
+def detect_aws_environment() -> AwsEnvironment:
+    """
+    Detect the AWS environment we're running in.
+    Order matters as some environments may have multiple indicators.
+    """
+    # Check Lambda first as it's most specific
+    if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
+        if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
+            return AwsEnvironment.CLOUD_FORMATION
+        return AwsEnvironment.LAMBDA
+
+    # Check EKS (IRSA)
+    if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
+        return AwsEnvironment.EKS
+
+    # Check App Runner
+    if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
+        return AwsEnvironment.APP_RUNNER
+
+    # Check ECS
+    if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+        "ECS_CONTAINER_METADATA_URI"
+    ):
+        return AwsEnvironment.ECS
+
+    # Check Elastic Beanstalk
+    if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
+        return AwsEnvironment.BEANSTALK
+
+    if is_running_on_ec2():
+        return AwsEnvironment.EC2
+
+    return AwsEnvironment.UNKNOWN
+
+
+def get_instance_role_arn() -> Optional[str]:
+    """Get role ARN from EC2 instance metadata using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return None
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/iam/security-credentials/",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        if response.status_code == 200:
+            role_name = response.text.strip()
+            if role_name:
+                sts = boto3.client("sts")
+                identity = sts.get_caller_identity()
+                return identity.get("Arn")
+    except Exception as e:
+        logger.debug(f"Failed to get instance role ARN: {e}")
+    return None
+
+
+def get_lambda_role_arn() -> Optional[str]:
+    """Get the Lambda function's role ARN"""
+    try:
+        function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+        if not function_name:
+            return None
+
+        lambda_client = boto3.client("lambda")
+        function_config = lambda_client.get_function_configuration(
+            FunctionName=function_name
+        )
+        return function_config.get("Role")
+    except Exception as e:
+        logger.debug(f"Failed to get Lambda role ARN: {e}")
+        return None
+
+
+def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
+    """
+    Get the current role ARN and source type based on the runtime environment.
+    Returns (role_arn, credential_source)
+    """
+    env = detect_aws_environment()
+
+    if env == AwsEnvironment.LAMBDA:
+        role_arn = get_lambda_role_arn()
+        return role_arn, AwsServicePrincipal.LAMBDA.value
+
+    elif env == AwsEnvironment.EKS:
+        role_arn = os.getenv("AWS_ROLE_ARN")
+        return role_arn, AwsServicePrincipal.EKS.value
+
+    elif env == AwsEnvironment.APP_RUNNER:
+        try:
+            sts = boto3.client("sts")
+            identity = sts.get_caller_identity()
+            return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value
+        except Exception as e:
+            logger.debug(f"Failed to get App Runner role: {e}")
+
+    elif env == AwsEnvironment.ECS:
+        try:
+            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+                "ECS_CONTAINER_METADATA_URI"
+            )
+            if metadata_uri:
+                response = requests.get(f"{metadata_uri}/task", timeout=1)
+                if response.status_code == HTTPStatus.OK:
+                    task_metadata = response.json()
+                    if "TaskARN" in task_metadata:
+                        return (
+                            task_metadata.get("TaskARN"),
+                            AwsServicePrincipal.ECS.value,
+                        )
+        except Exception as e:
+            logger.debug(f"Failed to get ECS task role: {e}")
+
+    elif env == AwsEnvironment.BEANSTALK:
+        # Beanstalk uses EC2 instance metadata
+        return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value
+
+    elif env == AwsEnvironment.EC2:
+        return get_instance_role_arn(), AwsServicePrincipal.EC2.value
+
+    return None, None
+
+
 def assume_role(
     role: AwsAssumeRoleConfig,
     aws_region: Optional[str],
@@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel):
     )
     aws_profile: Optional[str] = Field(
         default=None,
-        description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used",
+        description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.",
     )
     aws_region: Optional[str] = Field(None, description="AWS region code.")
 
@@ -145,6 +329,7 @@ class AwsConnectionConfig(ConfigModel):
 
     def get_session(self) -> Session:
         if self.aws_access_key_id and self.aws_secret_access_key:
+            # Explicit credentials take precedence
            session = Session(
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
@@ -152,38 +337,57 @@ class AwsConnectionConfig(ConfigModel):
                region_name=self.aws_region,
            )
        elif self.aws_profile:
+            # Named profile is second priority
            session = Session(
                region_name=self.aws_region, profile_name=self.aws_profile
            )
        else:
-            # Use boto3's credential autodetection.
+            # Use boto3's credential autodetection
            session = Session(region_name=self.aws_region)
 
-        if self._normalized_aws_roles():
-            # Use existing session credentials to start the chain of role assumption.
-            current_credentials = session.get_credentials()
-            credentials = {
-                "AccessKeyId": current_credentials.access_key,
-                "SecretAccessKey": current_credentials.secret_key,
-                "SessionToken": current_credentials.token,
-            }
-
-            for role in self._normalized_aws_roles():
-                if self._should_refresh_credentials():
-                    credentials = assume_role(
-                        role,
-                        self.aws_region,
-                        credentials=credentials,
+        target_roles = self._normalized_aws_roles()
+        if target_roles:
+            current_role_arn, credential_source = get_current_identity()
+
+            # Only assume role if:
+            # 1. We're not in a known AWS environment with a role, or
+            # 2. We need to assume a different role than our current one
+            should_assume_role = current_role_arn is None or any(
+                role.RoleArn != current_role_arn for role in target_roles
+            )
+
+            if should_assume_role:
+                env = detect_aws_environment()
+                logger.debug(f"Assuming role(s) from {env.value} environment")
+
+                current_credentials = session.get_credentials()
+                if current_credentials is None:
+                    raise ValueError("No credentials available for role assumption")
+
+                credentials = {
+                    "AccessKeyId": current_credentials.access_key,
+                    "SecretAccessKey": current_credentials.secret_key,
+                    "SessionToken": current_credentials.token,
+                }
+
+                for role in target_roles:
+                    if self._should_refresh_credentials():
+                        credentials = assume_role(
+                            role=role,
+                            aws_region=self.aws_region,
+                            credentials=credentials,
+                        )
+                        if isinstance(credentials["Expiration"], datetime):
+                            self._credentials_expiration = credentials["Expiration"]
+
+                session = Session(
+                    aws_access_key_id=credentials["AccessKeyId"],
+                    aws_secret_access_key=credentials["SecretAccessKey"],
+                    aws_session_token=credentials["SessionToken"],
+                    region_name=self.aws_region,
                )
-                if isinstance(credentials["Expiration"], datetime):
-                    self._credentials_expiration = credentials["Expiration"]
-
-            session = Session(
-                aws_access_key_id=credentials["AccessKeyId"],
-                aws_secret_access_key=credentials["SecretAccessKey"],
-                aws_session_token=credentials["SessionToken"],
-                region_name=self.aws_region,
-            )
+            else:
+                logger.debug(f"Using existing role from {credential_source}")
 
         return session
 
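Reviewer note: a rough sketch of how the new environment-detection helpers combine, mirroring the role-assumption decision in the updated get_session() above. The target role ARN is a placeholder for illustration only, not a value taken from the package:

from datahub.ingestion.source.aws.aws_common import (
    detect_aws_environment,
    get_current_identity,
)

env = detect_aws_environment()
current_role_arn, credential_source = get_current_identity()

# Placeholder ARN, purely for illustration.
target_role_arn = "arn:aws:iam::123456789012:role/datahub-ingestion"

# Same decision the new get_session() makes: only call STS AssumeRole when the
# runtime identity is unknown or differs from the configured role.
should_assume_role = current_role_arn is None or current_role_arn != target_role_arn
print(env.value, credential_source, should_assume_role)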
datahub/ingestion/source/aws/glue.py
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
@@ -220,6 +220,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned = 0
     filtered: List[str] = dataclass_field(default_factory=list)
+    databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
     num_job_script_location_missing: int = 0
     num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
         return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+        logger.debug("Getting all databases")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
         paginator = self.glue_client.get_paginator("get_databases")
 
@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
             pattern += "[?!TargetDatabase]"
 
         for database in paginator_response.search(pattern):
-            if self.source_config.database_pattern.allowed(database["Name"]):
+            if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+                self.source_config.catalog_id
+                and database.get("CatalogId")
+                and database.get("CatalogId") != self.source_config.catalog_id
+            ):
+                self.report.databases.dropped(database["Name"])
+            else:
+                self.report.databases.processed(database["Name"])
                 yield database
 
     def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+        logger.debug(f"Getting tables from database {database['Name']}")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
         paginator = self.glue_client.get_paginator("get_tables")
         database_name = database["Name"]
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -206,9 +206,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
 
     def _init_schema_resolver(self) -> SchemaResolver:
         schema_resolution_required = (
-            self.config.use_queries_v2
-            or self.config.lineage_parse_view_ddl
-            or self.config.lineage_use_sql_parser
+            self.config.use_queries_v2 or self.config.lineage_use_sql_parser
         )
         schema_ingestion_enabled = (
             self.config.include_schema_metadata
@@ -255,18 +253,16 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         for project in projects:
             yield from self.bq_schema_extractor.get_project_workunits(project)
 
-        if self.config.use_queries_v2:
-            # Always ingest View and Snapshot lineage with schema ingestion
-            self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
-
-            yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
-                [p.id for p in projects],
-                self.bq_schema_extractor.view_refs_by_project,
-                self.bq_schema_extractor.view_definitions,
-                self.bq_schema_extractor.snapshot_refs_by_project,
-                self.bq_schema_extractor.snapshots_by_ref,
-            )
+        self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
+        yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+            [p.id for p in projects],
+            self.bq_schema_extractor.view_refs_by_project,
+            self.bq_schema_extractor.view_definitions,
+            self.bq_schema_extractor.snapshot_refs_by_project,
+            self.bq_schema_extractor.snapshots_by_ref,
+        )
 
+        if self.config.use_queries_v2:
             # if both usage and lineage are disabled then skip queries extractor piece
             if (
                 not self.config.include_usage_statistics
@@ -306,10 +302,6 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_table_lineage:
             yield from self.lineage_extractor.get_lineage_workunits(
                 [p.id for p in projects],
-                self.bq_schema_extractor.view_refs_by_project,
-                self.bq_schema_extractor.view_definitions,
-                self.bq_schema_extractor.snapshot_refs_by_project,
-                self.bq_schema_extractor.snapshots_by_ref,
                 self.bq_schema_extractor.table_refs,
             )
 
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -463,10 +463,6 @@ class BigQueryV2Config(
         default=True,
         description="Use sql parser to resolve view/table lineage.",
     )
-    lineage_parse_view_ddl: bool = Field(
-        default=True,
-        description="Sql parse view ddl to get lineage.",
-    )
 
     lineage_sql_parser_use_raw_names: bool = Field(
         default=False,
@@ -572,11 +568,9 @@ class BigQueryV2Config(
         "See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.",
     )
 
-    # include_view_lineage and include_view_column_lineage are inherited from SQLCommonConfig
-    # but not used in bigquery so we hide them from docs.
-    include_view_lineage: bool = Field(default=True, hidden_from_docs=True)
-
-    include_view_column_lineage: bool = Field(default=True, hidden_from_docs=True)
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
+    _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
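Reviewer note: pydantic_removed_field is DataHub's helper for retiring config options; its implementation is not shown in this diff. A hedged approximation of the pattern, written as a plain pydantic v1 root validator (ExampleConfig and the validator name are illustrative, not the package's code):

import warnings
from pydantic import BaseModel, root_validator

class ExampleConfig(BaseModel):
    use_queries_v2: bool = True

    @root_validator(pre=True, allow_reuse=True)
    def _drop_removed_fields(cls, values):
        # Illustrative stand-in: drop retired keys from the recipe and warn.
        for removed in ("lineage_parse_view_ddl", "include_view_lineage"):
            if removed in values:
                warnings.warn(f"The '{removed}' option has been removed and is ignored.")
                values.pop(removed)
        return values

ExampleConfig.parse_obj({"lineage_parse_view_ddl": True})  # warns, then ignores the key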
datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional
 
@@ -15,6 +15,7 @@ from google.cloud.bigquery.table import (
     TimePartitioningType,
 )
 
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
@@ -393,13 +394,7 @@ class BigQuerySchemaApi:
             name=table.table_name,
             created=table.created,
             table_type=table.table_type,
-            last_altered=(
-                datetime.fromtimestamp(
-                    table.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if table.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(table.get("last_altered")),
             size_in_bytes=table.get("bytes"),
             rows_count=table.get("row_count"),
             comment=table.comment,
@@ -460,11 +455,7 @@ class BigQuerySchemaApi:
         return BigqueryView(
             name=view.table_name,
             created=view.created,
-            last_altered=(
-                datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
-                if view.get("last_altered") is not None
-                else None
-            ),
+            last_altered=(parse_ts_millis(view.get("last_altered"))),
             comment=view.comment,
             view_definition=view.view_definition,
             materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
@@ -705,13 +696,7 @@ class BigQuerySchemaApi:
         return BigqueryTableSnapshot(
             name=snapshot.table_name,
             created=snapshot.created,
-            last_altered=(
-                datetime.fromtimestamp(
-                    snapshot.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if snapshot.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(snapshot.get("last_altered")),
             comment=snapshot.comment,
             ddl=snapshot.ddl,
             snapshot_time=snapshot.snapshot_time,
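Reviewer note: parse_ts_millis centralizes the millisecond-epoch conversion that the removed inline blocks performed. Judging from those removed lines, its behavior is presumably equivalent to the sketch below; this is an inference, not the actual implementation in datahub.emitter.mce_builder:

from datetime import datetime, timezone
from typing import Optional

def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    # Mirrors the removed inline logic: epoch milliseconds -> tz-aware UTC datetime.
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)

print(parse_ts_millis(1735689600000))  # 2025-01-01 00:00:00+00:00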
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -653,14 +653,11 @@ class BigQuerySchemaGenerator:
             self.report.report_dropped(table_identifier.raw_table_name())
             return
 
-        if self.store_table_refs:
-            table_ref = str(
-                BigQueryTableRef(table_identifier).get_sanitized_table_ref()
-            )
-            self.table_refs.add(table_ref)
-            if self.config.lineage_parse_view_ddl and view.view_definition:
-                self.view_refs_by_project[project_id].add(table_ref)
-                self.view_definitions[table_ref] = view.view_definition
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if view.view_definition:
+            self.view_refs_by_project[project_id].add(table_ref)
+            self.view_definitions[table_ref] = view.view_definition
 
         view.column_count = len(columns)
         if not view.column_count:
@@ -701,14 +698,11 @@ class BigQuerySchemaGenerator:
                 f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}"
             )
 
-        if self.store_table_refs:
-            table_ref = str(
-                BigQueryTableRef(table_identifier).get_sanitized_table_ref()
-            )
-            self.table_refs.add(table_ref)
-            if snapshot.base_table_identifier:
-                self.snapshot_refs_by_project[project_id].add(table_ref)
-                self.snapshots_by_ref[table_ref] = snapshot
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if snapshot.base_table_identifier:
+            self.snapshot_refs_by_project[project_id].add(table_ref)
+            self.snapshots_by_ref[table_ref] = snapshot
 
         yield from self.gen_snapshot_dataset_workunits(
             table=snapshot,
@@ -1148,7 +1142,7 @@ class BigQuerySchemaGenerator:
             foreignKeys=foreign_keys if foreign_keys else None,
         )
 
-        if self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser:
+        if self.config.lineage_use_sql_parser:
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
            )
datahub/ingestion/source/bigquery_v2/lineage.py
@@ -291,16 +291,15 @@ class BigqueryLineageExtractor:
         snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
     ) -> Iterable[MetadataWorkUnit]:
         for project in projects:
-            if self.config.lineage_parse_view_ddl:
-                for view in view_refs_by_project[project]:
-                    self.datasets_skip_audit_log_lineage.add(view)
-                    self.aggregator.add_view_definition(
-                        view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
-                            BigQueryTableRef.from_string_name(view)
-                        ),
-                        view_definition=view_definitions[view],
-                        default_db=project,
-                    )
+            for view in view_refs_by_project[project]:
+                self.datasets_skip_audit_log_lineage.add(view)
+                self.aggregator.add_view_definition(
+                    view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
+                        BigQueryTableRef.from_string_name(view)
+                    ),
+                    view_definition=view_definitions[view],
+                    default_db=project,
+                )
 
             for snapshot_ref in snapshot_refs_by_project[project]:
                 snapshot = snapshots_by_ref[snapshot_ref]
@@ -322,23 +321,11 @@ class BigqueryLineageExtractor:
     def get_lineage_workunits(
         self,
         projects: List[str],
-        view_refs_by_project: Dict[str, Set[str]],
-        view_definitions: FileBackedDict[str],
-        snapshot_refs_by_project: Dict[str, Set[str]],
-        snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
         table_refs: Set[str],
     ) -> Iterable[MetadataWorkUnit]:
         if not self._should_ingest_lineage():
             return
 
-        yield from self.get_lineage_workunits_for_views_and_snapshots(
-            projects,
-            view_refs_by_project,
-            view_definitions,
-            snapshot_refs_by_project,
-            snapshots_by_ref,
-        )
-
         if self.config.use_exported_bigquery_audit_metadata:
             projects = ["*"]  # project_id not used when using exported metadata