acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/emitter/rest_emitter.py +16 -1
  8. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
  9. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  10. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  11. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  12. datahub/ingestion/source/kafka_connect/common.py +202 -0
  13. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  14. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  15. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  16. datahub/ingestion/source/looker/looker_common.py +54 -2
  17. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  18. datahub/ingestion/source/looker/looker_source.py +12 -1
  19. datahub/ingestion/source/mlflow.py +30 -5
  20. datahub/ingestion/source/powerbi/config.py +1 -14
  21. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  22. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  23. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
  24. datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
  25. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
  27. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  28. datahub/ingestion/source/sql/mssql/source.py +14 -0
  29. datahub/ingestion/source/tableau/tableau.py +4 -5
  30. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  31. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  32. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  33. datahub/ingestion/source/unity/source.py +4 -0
  34. datahub/ingestion/source_report/ingestion_stage.py +1 -0
  35. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  36. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  37. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  38. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/looker/looker_source.py
@@ -145,7 +145,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         self.source_config: LookerDashboardSourceConfig = config
         self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
         self.looker_api: LookerAPI = LookerAPI(self.source_config)
-        self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api)
+        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+            self.looker_api, self.reporter
+        )
         self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
             self.looker_api, self.reporter, self.source_config
         )
@@ -1673,5 +1675,14 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             yield usage_mcp.as_workunit()
         self.reporter.report_stage_end("usage_extraction")
 
+        # Dump looker user resource mappings.
+        logger.info("Ingesting looker user resource mapping workunits")
+        self.reporter.report_stage_start("user_resource_extraction")
+        yield from auto_workunit(
+            self.user_registry.to_platform_resource(
+                self.source_config.platform_instance
+            )
+        )
+
     def get_report(self) -> SourceReport:
         return self.reporter

datahub/ingestion/source/mlflow.py
@@ -38,16 +38,30 @@ T = TypeVar("T")
 class MLflowConfig(EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
-        description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)",
+        description=(
+            "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+            " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+        ),
     )
     registry_uri: Optional[str] = Field(
         default=None,
-        description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)",
+        description=(
+            "Registry server URI. If not set, an MLflow default registry_uri is used"
+            " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+        ),
     )
     model_name_separator: str = Field(
         default="_",
         description="A string which separates model name from its version (e.g. model_1 or model-1)",
     )
+    base_external_url: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base URL to use when constructing external URLs to MLflow."
+            " If not set, tracking_uri is used if it's an HTTP URL."
+            " If neither is set, external URLs are not generated."
+        ),
+    )
 
 
 @dataclass
@@ -279,12 +293,23 @@ class MLflowSource(Source):
         )
         return urn
 
-    def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]:
+    def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+        if isinstance(
+            self.client.tracking_uri, str
+        ) and self.client.tracking_uri.startswith("http"):
+            return self.client.tracking_uri
+        else:
+            return None
+
+    def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
         """
         Generate URL for a Model Version to MLflow UI.
         """
-        base_uri = self.client.tracking_uri
-        if base_uri.startswith("http"):
+        base_uri = (
+            self.config.base_external_url
+            or self._get_base_external_url_from_tracking_uri()
+        )
+        if base_uri:
             return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
         else:
             return None
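
The combined effect of the new base_external_url option and the tracking-URI fallback is a simple precedence rule: an explicitly configured base URL wins, an HTTP(S) tracking_uri is used next, and otherwise no external URL is emitted. A minimal sketch of the resulting URL shape, with purely illustrative values (none of these names come from the release itself):

    base_external_url = "https://mlflow.internal.example"  # hypothetical config value
    name, version = "churn_model", "7"                     # hypothetical model version
    url = f"{base_external_url.rstrip('/')}/#/models/{name}/versions/{version}"
    # -> https://mlflow.internal.example/#/models/churn_model/versions/7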

datahub/ingestion/source/powerbi/config.py
@@ -9,7 +9,7 @@ from pydantic.class_validators import root_validator
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
     return dict_
 
 
-class PlatformDetail(ConfigModel):
-    platform_instance: Optional[str] = pydantic.Field(
-        default=None,
-        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
-        "with platform instance name used in ingestion "
-        "recipe of other datahub sources.",
-    )
-    env: str = pydantic.Field(
-        default=builder.DEFAULT_ENV,
-        description="The environment that all assets produced by DataHub platform ingestion source belong to",
-    )
-
-
 class DataBricksPlatformDetail(PlatformDetail):
     """
     metastore is an additional field used in Databricks connector to generate the dataset urn

datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
@@ -2,8 +2,8 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Union
 
+from datahub.configuration.source_common import PlatformDetail
 from datahub.ingestion.source.powerbi.config import (
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBIPlatformDetail,
 )

datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -5,13 +5,13 @@ from typing import Dict, List, Optional, Tuple, Type, cast
 
 from lark import Tree
 
+from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     Constant,
     DataBricksPlatformDetail,
     DataPlatformPair,
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
     PowerBIPlatformDetail,

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -265,64 +265,17 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         with PerfTimer() as timer:
             self.report.num_external_table_edges_scanned = 0
 
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_copy_history(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-            logger.info(
-                "Done populating external lineage from copy history. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
-
-            for (
-                known_lineage_mapping
-            ) in self._populate_external_lineage_from_show_query(discovered_tables):
-                self.sql_aggregator.add(known_lineage_mapping)
-
-            logger.info(
-                "Done populating external lineage from show external tables. "
-                f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far."
-            )
+            for entry in self._get_copy_history_lineage(discovered_tables):
+                self.sql_aggregator.add(entry)
+            logger.info("Done populating external lineage from copy history. ")
 
             self.report.external_lineage_queries_secs = timer.elapsed_seconds()
 
-    # Handles the case for explicitly created external tables.
-    # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_show_query(
-        self, discovered_tables: List[str]
-    ) -> Iterable[KnownLineageMapping]:
-        external_tables_query: str = SnowflakeQuery.show_external_tables()
-        try:
-            for db_row in self.connection.query(external_tables_query):
-                key = self.identifiers.get_dataset_identifier(
-                    db_row["name"], db_row["schema_name"], db_row["database_name"]
-                )
-
-                if key not in discovered_tables:
-                    continue
-                if db_row["location"].startswith("s3://"):
-                    yield KnownLineageMapping(
-                        upstream_urn=make_s3_urn_for_lineage(
-                            db_row["location"], self.config.env
-                        ),
-                        downstream_urn=self.identifiers.gen_dataset_urn(key),
-                    )
-                    self.report.num_external_table_edges_scanned += 1
-
-                self.report.num_external_table_edges_scanned += 1
-        except Exception as e:
-            logger.debug(e, exc_info=e)
-            self.structured_reporter.warning(
-                "Error populating external table lineage from Snowflake",
-                exc=e,
-            )
-            self.report_status(EXTERNAL_LINEAGE, False)
-
     # Handles the case where a table is populated from an external stage/s3 location via copy.
     # Eg: copy into category_english from @external_s3_stage;
     # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
     # NOTE: Snowflake does not log this information to the access_history table.
-    def _populate_external_lineage_from_copy_history(
+    def _get_copy_history_lineage(
         self, discovered_tables: List[str]
     ) -> Iterable[KnownLineageMapping]:
         query: str = SnowflakeQuery.copy_lineage_history(

datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -247,9 +247,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             for entry in self.fetch_copy_history():
                 queries.append(entry)
 
-        # TODO: Add "show external tables" lineage to the main schema extractor.
-        # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor.
-
         with self.report.query_log_fetch_timer:
             for entry in self.fetch_query_log():
                 queries.append(entry)

datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -16,6 +16,7 @@ from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
     classification_workunit_processor,
 )
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -35,6 +36,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import (
 )
 from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
 from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
+from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
 from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.snowflake.snowflake_schema import (
     SCHEMA_PARALLELISM,
@@ -65,6 +67,7 @@ from datahub.ingestion.source.sql.sql_utils import (
     get_domain_wu,
 )
 from datahub.ingestion.source_report.ingestion_stage import (
+    EXTERNAL_TABLE_DDL_LINEAGE,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -96,7 +99,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
-from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownLineageMapping,
+    SqlParsingAggregator,
+)
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -180,7 +186,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []
-        self.aggregator: Optional[SqlParsingAggregator] = aggregator
+
+        self.aggregator = aggregator
 
     def get_connection(self) -> SnowflakeConnection:
         return self.connection
@@ -212,6 +219,19 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
                 yield from self._process_database(snowflake_db)
 
+            self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE)
+            discovered_tables: List[str] = [
+                self.identifiers.get_dataset_identifier(
+                    table_name, schema.name, db.name
+                )
+                for db in self.databases
+                for schema in db.schemas
+                for table_name in schema.tables
+            ]
+            if self.aggregator:
+                for entry in self._external_tables_ddl_lineage(discovered_tables):
+                    self.aggregator.add(entry)
+
         except SnowflakePermissionError as e:
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
@@ -1082,3 +1102,33 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         # Access to table but none of its constraints - is this possible ?
         return constraints.get(table_name, [])
+
+    # Handles the case for explicitly created external tables.
+    # NOTE: Snowflake does not log this information to the access_history table.
+    def _external_tables_ddl_lineage(
+        self, discovered_tables: List[str]
+    ) -> Iterable[KnownLineageMapping]:
+        external_tables_query: str = SnowflakeQuery.show_external_tables()
+        try:
+            for db_row in self.connection.query(external_tables_query):
+                key = self.identifiers.get_dataset_identifier(
+                    db_row["name"], db_row["schema_name"], db_row["database_name"]
+                )
+
+                if key not in discovered_tables:
+                    continue
+                if db_row["location"].startswith("s3://"):
+                    yield KnownLineageMapping(
+                        upstream_urn=make_s3_urn_for_lineage(
+                            db_row["location"], self.config.env
+                        ),
+                        downstream_urn=self.identifiers.gen_dataset_urn(key),
+                    )
+                    self.report.num_external_table_edges_scanned += 1
+
+                self.report.num_external_table_edges_scanned += 1
+        except Exception as e:
+            self.structured_reporter.warning(
+                "External table ddl lineage extraction failed",
+                exc=e,
+            )
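
Taken together with the snowflake_lineage_v2.py hunk above, the `SHOW EXTERNAL TABLES` handling now lives in the schema generator and runs under the new EXTERNAL_TABLE_DDL_LINEAGE stage. A rough, self-contained sketch of the filtering it performs, assuming lowercase db.schema.table identifiers (the row values and identifier format below are illustrative, not the connector's exact get_dataset_identifier output):

    discovered_tables = ["analytics.raw.category_english"]  # hypothetical identifiers
    show_external_tables_rows = [
        {
            "name": "CATEGORY_ENGLISH",
            "schema_name": "RAW",
            "database_name": "ANALYTICS",
            "location": "s3://acryl-snow-demo-olist/olist_raw_data/category_english",
        },
    ]
    for row in show_external_tables_rows:
        key = ".".join(
            part.lower()
            for part in (row["database_name"], row["schema_name"], row["name"])
        )
        if key in discovered_tables and row["location"].startswith("s3://"):
            print(f"external lineage edge: {row['location']} -> {key}")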

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -161,35 +161,32 @@ class SnowflakeV2Source(
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
-        self.aggregator: Optional[SqlParsingAggregator] = None
-
-        if self.config.use_queries_v2 or self.config.include_table_lineage:
-            self.aggregator = self._exit_stack.enter_context(
-                SqlParsingAggregator(
-                    platform=self.identifiers.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                    graph=self.ctx.graph,
-                    eager_graph_load=(
-                        # If we're ingestion schema metadata for tables/views, then we will populate
-                        # schemas into the resolver as we go. We only need to do a bulk fetch
-                        # if we're not ingesting schema metadata as part of ingestion.
-                        not (
-                            self.config.include_technical_schema
-                            and self.config.include_tables
-                            and self.config.include_views
-                        )
-                        and not self.config.lazy_schema_resolver
-                    ),
-                    generate_usage_statistics=False,
-                    generate_operations=False,
-                    format_queries=self.config.format_sql_queries,
-                )
+
+        self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
+            SqlParsingAggregator(
+                platform=self.identifiers.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                graph=self.ctx.graph,
+                eager_graph_load=(
+                    # If we're ingestion schema metadata for tables/views, then we will populate
+                    # schemas into the resolver as we go. We only need to do a bulk fetch
+                    # if we're not ingesting schema metadata as part of ingestion.
+                    not (
+                        self.config.include_technical_schema
+                        and self.config.include_tables
+                        and self.config.include_views
+                    )
+                    and not self.config.lazy_schema_resolver
+                ),
+                generate_usage_statistics=False,
+                generate_operations=False,
+                format_queries=self.config.format_sql_queries,
             )
-            self.report.sql_aggregator = self.aggregator.report
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
         if self.config.include_table_lineage:
-            assert self.aggregator is not None
             redundant_lineage_run_skip_handler: Optional[
                 RedundantLineageRunSkipHandler
             ] = None
@@ -487,8 +484,6 @@ class SnowflakeV2Source(
 
         databases = schema_extractor.databases
 
-        # TODO: The checkpoint state for stale entity detection can be committed here.
-
         if self.config.shares:
             yield from SnowflakeSharesHandler(
                 self.config, self.report
@@ -540,6 +535,7 @@ class SnowflakeV2Source(
                 identifiers=self.identifiers,
                 schema_resolver=schema_resolver,
                 discovered_tables=discovered_datasets,
+                graph=self.ctx.graph,
             )
 
             # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs

datahub/ingestion/source/sql/mssql/job_models.py
@@ -1,11 +1,17 @@
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn
+from datahub.emitter.mce_builder import (
+    make_data_flow_urn,
+    make_data_job_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+)
 from datahub.metadata.schema_classes import (
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
+    DataPlatformInstanceClass,
 )
 
 
@@ -204,6 +210,18 @@ class MSSQLDataJob:
             status=self.status,
         )
 
+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.flow.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.flow.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    platform=self.entity.flow.orchestrator,
+                    instance=self.entity.flow.platform_instance,
+                ),
+            )
+        return None
+
 
 @dataclass
 class MSSQLDataFlow:
@@ -238,3 +256,14 @@ class MSSQLDataFlow:
             customProperties=self.flow_properties,
             externalUrl=self.external_url,
         )
+
+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    self.entity.orchestrator, self.entity.platform_instance
+                ),
+            )
+        return None
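
For reference, the two builder helpers added to the imports are what give the new aspect its platform and instance URNs. A hedged sketch of the values they are expected to produce (the "my_sql_server" instance name is illustrative, and the URN strings reflect the usual mce_builder formatting rather than anything shown in this diff):

    from datahub.emitter.mce_builder import (
        make_data_platform_urn,
        make_dataplatform_instance_urn,
    )

    print(make_data_platform_urn("mssql"))
    # urn:li:dataPlatform:mssql
    print(make_dataplatform_instance_urn("mssql", "my_sql_server"))
    # urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my_sql_server)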

datahub/ingestion/source/sql/mssql/source.py
@@ -639,6 +639,13 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()
 
+        data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -654,6 +661,13 @@ class SQLServerSource(SQLAlchemySource):
             entityUrn=data_flow.urn,
             aspect=data_flow.as_dataflow_info_aspect,
         ).as_workunit()
+
+        data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear
 
     def get_inspectors(self) -> Iterable[Inspector]:

datahub/ingestion/source/tableau/tableau.py
@@ -645,7 +645,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     # the site-role might be different on another site
     logged_in_user: UserInfo = UserInfo.from_server(server=server)
 
-    if not logged_in_user.is_site_administrator_explorer():
+    if not logged_in_user.has_site_administrator_explorer_privileges():
         report.warning(
             title=title,
             message=message,
@@ -896,10 +896,9 @@ class TableauSiteSource:
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"
 
     def _re_authenticate(self):
-        tableau_auth: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.config.get_tableau_auth(self.site_id)
-        self.server.auth.sign_in(tableau_auth)
+        # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
+        # so we need to recreate the Tableau Server object
+        self.server = self.config.make_tableau_client(self.site_id)
 
     @property
     def site_content_url(self) -> Optional[str]:

datahub/ingestion/source/tableau/tableau_constant.py
@@ -82,4 +82,6 @@ PROJECT = "Project"
 SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
-SITE_ROLE = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator"
+ROLE_SERVER_ADMIN = "ServerAdministrator"

datahub/ingestion/source/tableau/tableau_server_wrapper.py
@@ -11,8 +11,12 @@ class UserInfo:
     site_role: str
     site_id: str
 
-    def is_site_administrator_explorer(self):
-        return self.site_role == c.SITE_ROLE
+    def has_site_administrator_explorer_privileges(self):
+        return self.site_role in [
+            c.ROLE_SITE_ADMIN_EXPLORER,
+            c.ROLE_SITE_ADMIN_CREATOR,
+            c.ROLE_SERVER_ADMIN,
+        ]
 
     @staticmethod
     def from_server(server: Server) -> "UserInfo":
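
The rename reflects a behavioral change: the permission check is no longer an equality test against a single role but membership in a small set of administrator roles. A standalone sketch of the same logic, reusing the role strings added to tableau_constant.py above:

    ADMIN_ROLES = {
        "SiteAdministratorExplorer",
        "SiteAdministratorCreator",
        "ServerAdministrator",
    }

    def has_site_administrator_explorer_privileges(site_role: str) -> bool:
        return site_role in ADMIN_ROLES

    assert has_site_administrator_explorer_privileges("ServerAdministrator")
    assert not has_site_administrator_explorer_privileges("Explorer")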

datahub/ingestion/source/tableau/tableau_validation.py
@@ -28,7 +28,7 @@ def check_user_role(
 
     try:
         # TODO: Add check for `Enable Derived Permissions`
-        if not logged_in_user.is_site_administrator_explorer():
+        if not logged_in_user.has_site_administrator_explorer_privileges():
             capability_dict[c.SITE_PERMISSION] = CapabilityReport(
                 capable=False,
                 failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",

datahub/ingestion/source/unity/source.py
@@ -26,6 +26,9 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -260,6 +263,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
            ).workunit_processor,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source_report/ingestion_stage.py
@@ -14,6 +14,7 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"
 

datahub/sql_parsing/sql_parsing_aggregator.py
@@ -490,7 +490,7 @@ class SqlParsingAggregator(Closeable):
         self._exit_stack.push(self._query_usage_counts)
 
         # Tool Extractor
-        self._tool_meta_extractor = ToolMetaExtractor()
+        self._tool_meta_extractor = ToolMetaExtractor.create(graph)
         self.report.tool_meta_report = self._tool_meta_extractor.report
 
     def close(self) -> None: