acryl-datahub 0.15.0.5rc7__py3-none-any.whl → 0.15.0.5rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (38)
  1. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/METADATA +2493 -2463
  2. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/RECORD +38 -35
  3. datahub/_version.py +1 -1
  4. datahub/cli/iceberg_cli.py +707 -0
  5. datahub/entrypoints.py +21 -0
  6. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  7. datahub/ingestion/glossary/classification_mixin.py +6 -0
  8. datahub/ingestion/glossary/classifier.py +3 -2
  9. datahub/ingestion/source/aws/glue.py +3 -2
  10. datahub/ingestion/source/identity/azure_ad.py +6 -14
  11. datahub/ingestion/source/mode.py +2 -4
  12. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  13. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  14. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  16. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -36
  17. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  18. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  19. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  20. datahub/ingestion/source/sql/mssql/source.py +17 -0
  21. datahub/ingestion/source/tableau/tableau.py +14 -12
  22. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  23. datahub/metadata/_schema_classes.py +160 -2
  24. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  25. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  26. datahub/metadata/schema.avsc +96 -7
  27. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  28. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  29. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  30. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  31. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  32. datahub/metadata/schemas/MetadataChangeEvent.avsc +5 -5
  33. datahub/specific/dashboard.py +43 -1
  34. datahub/upgrade/upgrade.py +13 -5
  35. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/LICENSE +0 -0
  36. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/WHEEL +0 -0
  37. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/entry_points.txt +0 -0
  38. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import multiprocessing
 import os
 import platform
 import sys
@@ -183,6 +184,18 @@ datahub.add_command(datacontract)
 datahub.add_command(assertions)
 datahub.add_command(container)

+try:
+    from datahub.cli.iceberg_cli import iceberg
+
+    datahub.add_command(iceberg)
+except ImportError as e:
+    logger.debug(f"Failed to load datahub iceberg command: {e}")
+    datahub.add_command(
+        make_shim_command(
+            "iceberg", "run `pip install 'acryl-datahub[iceberg-catalog]'`"
+        )
+    )
+
 try:
     from datahub.cli.lite_cli import lite

@@ -205,6 +218,14 @@ except ImportError as e:


 def main(**kwargs):
+    # We use threads in a variety of places within our CLI. The multiprocessing
+    # "fork" start method is not safe to use with threads.
+    # MacOS and Windows already default to "spawn", and Linux will as well starting in Python 3.14.
+    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # Eventually it may make sense to use "forkserver" as the default where available,
+    # but we can revisit that in the future.
+    multiprocessing.set_start_method("spawn", force=True)
+
     # This wrapper prevents click from suppressing errors.
     try:
         sys.exit(datahub(standalone_mode=False, **kwargs))
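The main() change above forces the "spawn" start method because the CLI already uses threads internally, and forking a threaded parent process can deadlock the child. A minimal, standalone sketch of the same pattern (the toy worker function and background thread are illustrative, not part of the package):

import multiprocessing
import threading


def worker(x: int) -> int:
    # Must be defined at module level so "spawn" workers can import it by reference.
    return x * x


def main() -> None:
    # Force "spawn" before any pools or processes are created, mirroring the CLI change.
    multiprocessing.set_start_method("spawn", force=True)

    # A background thread in the parent; under "fork" this is the situation that can
    # leave a child process with copies of locks held by other threads.
    t = threading.Thread(target=lambda: None)
    t.start()

    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(worker, range(4)))  # [0, 1, 4, 9]
    t.join()


if __name__ == "__main__":
    main()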
datahub/ingestion/api/incremental_lineage_helper.py CHANGED
@@ -102,6 +102,10 @@ def convert_dashboard_info_to_patch(
     if aspect.datasets:
         patch_builder.add_datasets(aspect.datasets)

+    if aspect.dashboards:
+        for dashboard in aspect.dashboards:
+            patch_builder.add_dashboard(dashboard)
+
     if aspect.access:
         patch_builder.set_access(aspect.access)

datahub/ingestion/glossary/classification_mixin.py CHANGED
@@ -1,5 +1,6 @@
 import concurrent.futures
 import logging
+import multiprocessing
 from dataclasses import dataclass, field
 from functools import partial
 from math import ceil
@@ -182,6 +183,11 @@ class ClassificationHandler:

         with concurrent.futures.ProcessPoolExecutor(
             max_workers=self.config.classification.max_workers,
+            # The fork start method, which is the default on Linux for Python < 3.14, is not
+            # safe when the main process uses threads. The default start method on windows/macOS is
+            # already spawn, and will be changed to spawn for Linux in Python 3.14.
+            # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+            mp_context=multiprocessing.get_context("spawn"),
         ) as executor:
             column_info_proposal_futures = [
                 executor.submit(
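The classification change applies the same reasoning locally: instead of changing the global start method, it hands ProcessPoolExecutor an explicit "spawn" context. A self-contained sketch of that pattern, with a stand-in classify function:

import concurrent.futures
import multiprocessing


def classify(value: str) -> str:
    # Stand-in for the real per-column classification work.
    return value.upper()


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=4,
        # "spawn" starts each worker from a clean interpreter instead of forking
        # a (possibly multi-threaded) parent process.
        mp_context=multiprocessing.get_context("spawn"),
    ) as executor:
        futures = [executor.submit(classify, v) for v in ["email", "ssn", "phone"]]
        print([f.result() for f in futures])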
datahub/ingestion/glossary/classifier.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -37,8 +38,8 @@ class ClassificationConfig(ConfigModel):
     )

     max_workers: int = Field(
-        default=1,
-        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
+        default=(os.cpu_count() or 4),
+        description="Number of worker processes to use for classification. Set to 1 to disable.",
     )

     table_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/aws/glue.py CHANGED
@@ -113,6 +113,7 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -220,7 +221,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     catalog_id: Optional[str] = None
     tables_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
     databases: EntityFilterReport = EntityFilterReport.field(type="database")

     num_job_script_location_missing: int = 0
@@ -746,7 +747,7 @@ class GlueSource(StatefulIngestionSourceBase):
                 for tables in self.get_tables_from_database(database):
                     all_tables.append(tables)
             except Exception as e:
-                self.report.failure(
+                self.report.warning(
                     message="Failed to get tables from database",
                     context=database["Name"],
                     exc=e,
datahub/ingestion/source/identity/azure_ad.py CHANGED
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry

 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )

-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)


 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
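This file (and mode.py below) retires a config option via datahub's pydantic_removed_field helper, whose implementation is not shown in this diff. A rough, hypothetical sketch of how such a shim can be built with pydantic v1-style validators (not the actual datahub code): register a pre root validator that drops the removed key and warns.

import warnings

import pydantic  # assumes pydantic v1-style validators


def removed_field(field_name: str) -> classmethod:
    def _drop_removed(cls, values: dict) -> dict:
        if field_name in values:
            warnings.warn(f"The '{field_name}' option was removed and is now ignored.")
            values.pop(field_name)
        return values

    # Give each generated validator a unique name so several removed fields
    # on one model do not shadow each other.
    _drop_removed.__name__ = f"_drop_removed_{field_name}"
    return pydantic.root_validator(pre=True, allow_reuse=True)(_drop_removed)


class ExampleConfig(pydantic.BaseModel):
    workspace: str

    _drop_filtered_tracking = removed_field("filtered_tracking")


config = ExampleConfig.parse_obj({"workspace": "acme", "filtered_tracking": True})
print(config)  # the removed key is accepted but discarded with a warning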
datahub/ingestion/source/mode.py CHANGED
@@ -24,6 +24,7 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -155,10 +156,7 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     workspace: str = Field(
         description="The Mode workspace name. Find it in Settings > Workspace > Details."
     )
-    default_schema: str = Field(
-        default="public",
-        description="Default schema to use when schema is not provided in an SQL query",
-    )
+    _default_schema = pydantic_removed_field("default_schema")

     space_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -249,6 +249,12 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )

+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -302,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )

+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
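Both new Snowflake options are ordinary recipe fields. An illustrative fragment in Python dict form (hypothetical usernames; required connection settings such as account_id and credentials are omitted):

snowflake_config_fragment = {
    # Skip lineage/usage/queries attribution for noisy service accounts.
    # Only applies when use_queries_v2 is enabled.
    "pushdown_deny_usernames": ["FIVETRAN_SVC_USER", "AIRFLOW_SVC_USER"],
    # Hidden option: how long to wait for the structured-properties template cache
    # to invalidate when extract_tags_as_structured_properties is enabled.
    "structured_properties_template_cache_invalidation_interval": 60,
}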
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -159,6 +159,17 @@ class SnowflakeQuery:
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""

+    @staticmethod
+    def get_all_tags():
+        return """
+        SELECT tag_database as "TAG_DATABASE",
+        tag_schema AS "TAG_SCHEMA",
+        tag_name AS "TAG_NAME",
+        FROM snowflake.account_usage.tag_references
+        GROUP BY TAG_DATABASE , TAG_SCHEMA, tag_name
+        ORDER BY TAG_DATABASE, TAG_SCHEMA, TAG_NAME ASC;
+        """
+
     @staticmethod
     def get_all_tags_on_object_with_propagation(
         db_name: str, quoted_identifier: str, domain: str
datahub/ingestion/source/snowflake/snowflake_report.py CHANGED
@@ -114,6 +114,7 @@ class SnowflakeV2Report(
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
+    num_structured_property_templates_created: int = 0

     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None

datahub/ingestion/source/snowflake/snowflake_schema.py CHANGED
@@ -285,6 +285,23 @@ class SnowflakeDataDictionary(SupportsAsObj):

         return secure_view_definitions

+    def get_all_tags(self) -> List[SnowflakeTag]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_all_tags(),
+        )
+
+        tags = [
+            SnowflakeTag(
+                database=tag["TAG_DATABASE"],
+                schema=tag["TAG_SCHEMA"],
+                name=tag["TAG_NAME"],
+                value="",
+            )
+            for tag in cur
+        ]
+
+        return tags
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED
@@ -1,10 +1,10 @@
 import itertools
 import logging
+import time
 from typing import Dict, Iterable, List, Optional, Union

 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import (
-    get_sys_time,
     make_data_platform_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
@@ -74,7 +74,6 @@ from datahub.ingestion.source_report.ingestion_stage import (
     PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     GlobalTags,
     Status,
     SubTypes,
@@ -101,15 +100,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     StringType,
     TimeType,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.structured import (
-    StructuredPropertyDefinition,
-)
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
 from datahub.metadata.urns import (
-    ContainerUrn,
-    DatasetUrn,
-    DataTypeUrn,
-    EntityTypeUrn,
     SchemaFieldUrn,
     StructuredPropertyUrn,
 )
@@ -191,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.domain_registry: Optional[DomainRegistry] = domain_registry
         self.classification_handler = ClassificationHandler(self.config, self.report)
         self.tag_extractor = SnowflakeTagExtractor(
-            config, self.data_dictionary, self.report
+            config, self.data_dictionary, self.report, identifiers
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
         self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
@@ -217,6 +209,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         return self.identifiers.snowflake_identifier(identifier)

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.extract_tags_as_structured_properties:
+            logger.info("Creating structured property templates for tags")
+            yield from self.tag_extractor.create_structured_property_templates()
+            # We have to wait until cache invalidates to make sure the structured property template is available
+            logger.info(
+                f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+            )
+            time.sleep(
+                self.config.structured_properties_template_cache_invalidation_interval
+            )
         self.databases = []
         for database in self.get_databases() or []:
             self.report.report_entity_scanned(database.name, "database")
@@ -698,6 +700,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
         use_sp = self.config.extract_tags_as_structured_properties
+
         identifier = (
             self.snowflake_identifier(tag.structured_property_identifier())
             if use_sp
@@ -708,10 +711,11 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             return

         self.report.report_tag_processed(identifier)
+
         if use_sp:
-            yield from self.gen_tag_as_structured_property_workunits(tag)
-        else:
-            yield from self.gen_tag_workunits(tag)
+            return
+
+        yield from self.gen_tag_workunits(tag)

     def _format_tags_as_structured_properties(
         self, tags: List[SnowflakeTag]
@@ -732,6 +736,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if table.tags:
             for tag in table.tags:
                 yield from self._process_tag(tag)
+
         for column_name in table.column_tags:
             for tag in table.column_tags[column_name]:
                 yield from self._process_tag(tag)
@@ -903,29 +908,6 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             entityUrn=tag_urn, aspect=tag_properties_aspect
         ).as_workunit()

-    def gen_tag_as_structured_property_workunits(
-        self, tag: SnowflakeTag
-    ) -> Iterable[MetadataWorkUnit]:
-        identifier = self.snowflake_identifier(tag.structured_property_identifier())
-        urn = StructuredPropertyUrn(identifier).urn()
-        aspect = StructuredPropertyDefinition(
-            qualifiedName=identifier,
-            displayName=tag.name,
-            valueType=DataTypeUrn("datahub.string").urn(),
-            entityTypes=[
-                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
-            ],
-            lastModified=AuditStamp(
-                time=get_sys_time(), actor="urn:li:corpuser:datahub"
-            ),
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=urn,
-            aspect=aspect,
-        ).as_workunit()
-
     def gen_column_tags_as_structured_properties(
         self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
     ) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/snowflake/snowflake_tag.py CHANGED
@@ -1,6 +1,9 @@
 import logging
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional

+from datahub.emitter.mce_builder import get_sys_time
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     SnowflakeV2Config,
@@ -12,7 +15,22 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeTag,
     _SnowflakeTagCache,
 )
-from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
+from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeCommonMixin,
+    SnowflakeIdentifierBuilder,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
+from datahub.metadata.com.linkedin.pegasus2avro.structured import (
+    StructuredPropertyDefinition,
+)
+from datahub.metadata.urns import (
+    ContainerUrn,
+    DatasetUrn,
+    DataTypeUrn,
+    EntityTypeUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+)

 logger: logging.Logger = logging.getLogger(__name__)

@@ -23,11 +41,12 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         config: SnowflakeV2Config,
         data_dictionary: SnowflakeDataDictionary,
         report: SnowflakeV2Report,
+        snowflake_identifiers: SnowflakeIdentifierBuilder,
     ) -> None:
         self.config = config
         self.data_dictionary = data_dictionary
         self.report = report
-
+        self.snowflake_identifiers = snowflake_identifiers
         self.tag_cache: Dict[str, _SnowflakeTagCache] = {}

     def _get_tags_on_object_without_propagation(
@@ -59,6 +78,41 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             raise ValueError(f"Unknown domain {domain}")
         return tags

+    def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
+        for tag in self.data_dictionary.get_all_tags():
+            if not self.config.structured_property_pattern.allowed(
+                tag.tag_identifier()
+            ):
+                continue
+            if self.config.extract_tags_as_structured_properties:
+                self.report.num_structured_property_templates_created += 1
+                yield from self.gen_tag_as_structured_property_workunits(tag)
+
+    def gen_tag_as_structured_property_workunits(
+        self, tag: SnowflakeTag
+    ) -> Iterable[MetadataWorkUnit]:
+        identifier = self.snowflake_identifiers.snowflake_identifier(
+            tag.structured_property_identifier()
+        )
+        urn = StructuredPropertyUrn(identifier).urn()
+        aspect = StructuredPropertyDefinition(
+            qualifiedName=identifier,
+            displayName=tag.name,
+            valueType=DataTypeUrn("datahub.string").urn(),
+            entityTypes=[
+                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
+            ],
+            lastModified=AuditStamp(
+                time=get_sys_time(), actor="urn:li:corpuser:datahub"
+            ),
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=aspect,
+        ).as_workunit()
+
     def _get_tags_on_object_with_propagation(
         self,
         domain: str,
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -567,6 +567,7 @@ class SnowflakeV2Source(
                 include_queries=self.config.include_queries,
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
+                pushdown_deny_usernames=self.config.pushdown_deny_usernames,
             ),
             structured_report=self.report,
             filters=self.filters,
datahub/ingestion/source/sql/mssql/job_models.py CHANGED
@@ -7,7 +7,12 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.emitter.mcp_builder import (
+    DatabaseKey,
+    SchemaKey,
+)
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
@@ -171,11 +176,7 @@ class MSSQLDataJob:
             flow_id=self.entity.flow.formatted_name,
             job_id=self.entity.formatted_name,
             cluster=self.entity.flow.cluster,
-            platform_instance=(
-                self.entity.flow.platform_instance
-                if self.entity.flow.platform_instance
-                else None
-            ),
+            platform_instance=self.entity.flow.platform_instance,
         )

     def add_property(
@@ -222,6 +223,26 @@ class MSSQLDataJob:
             )
         return None

+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        key_args = dict(
+            platform=self.entity.flow.orchestrator,
+            instance=self.entity.flow.platform_instance,
+            env=self.entity.flow.env,
+            database=self.entity.flow.db,
+        )
+        container_key = (
+            SchemaKey(
+                schema=self.entity.schema,
+                **key_args,
+            )
+            if isinstance(self.entity, StoredProcedure)
+            else DatabaseKey(
+                **key_args,
+            )
+        )
+        return ContainerClass(container=container_key.as_urn())
+

 @dataclass
 class MSSQLDataFlow:
@@ -244,9 +265,7 @@ class MSSQLDataFlow:
             orchestrator=self.entity.orchestrator,
             flow_id=self.entity.formatted_name,
             cluster=self.entity.cluster,
-            platform_instance=(
-                self.entity.platform_instance if self.entity.platform_instance else None
-            ),
+            platform_instance=self.entity.platform_instance,
         )

     @property
@@ -267,3 +286,13 @@ class MSSQLDataFlow:
             ),
         )
         return None
+
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        databaseKey = DatabaseKey(
+            platform=self.entity.orchestrator,
+            instance=self.entity.platform_instance,
+            env=self.entity.env,
+            database=self.entity.db,
+        )
+        return ContainerClass(container=databaseKey.as_urn())
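The new as_container_aspect properties attach MSSQL flows and jobs to database/schema containers. A small sketch with illustrative values showing what those keys resolve to:

from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
from datahub.metadata.schema_classes import ContainerClass

# Container keys are deterministic: the same field values always map to the same URN.
db_key = DatabaseKey(platform="mssql", instance=None, env="PROD", database="DemoData")
schema_key = SchemaKey(
    platform="mssql", instance=None, env="PROD", database="DemoData", schema="dbo"
)

print(db_key.as_urn())  # urn:li:container:<guid derived from the key fields>
print(ContainerClass(container=schema_key.as_urn()))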
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -108,6 +108,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=True,
         description="Enable lineage extraction for stored procedures",
     )
+    include_containers_for_pipelines: bool = Field(
+        default=False,
+        description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -641,6 +645,12 @@ class SQLServerSource(SQLAlchemySource):
                 aspect=data_platform_instance_aspect,
             ).as_workunit()

+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_container_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -683,6 +693,13 @@ class SQLServerSource(SQLAlchemySource):
                 entityUrn=data_flow.urn,
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
+
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_flow.as_container_aspect,
+            ).as_workunit()
+
         # TODO: Add SubType when it appear

     def get_inspectors(self) -> Iterable[Inspector]:
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -2428,10 +2428,12 @@ class TableauSiteSource:
             ]
         ],
     ) -> Optional["SqlParsingResult"]:
-        database_info = datasource.get(c.DATABASE) or {
-            c.NAME: c.UNKNOWN.lower(),
-            c.CONNECTION_TYPE: datasource.get(c.CONNECTION_TYPE),
-        }
+        database_field = datasource.get(c.DATABASE) or {}
+        database_id: Optional[str] = database_field.get(c.ID)
+        database_name: Optional[str] = database_field.get(c.NAME) or c.UNKNOWN.lower()
+        database_connection_type: Optional[str] = database_field.get(
+            c.CONNECTION_TYPE
+        ) or datasource.get(c.CONNECTION_TYPE)

         if (
             datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False)
@@ -2440,10 +2442,7 @@ class TableauSiteSource:
             logger.debug(f"datasource {datasource_urn} is not created from custom sql")
             return None

-        if (
-            database_info.get(c.NAME) is None
-            or database_info.get(c.CONNECTION_TYPE) is None
-        ):
+        if database_connection_type is None:
             logger.debug(
                 f"database information is missing from datasource {datasource_urn}"
             )
@@ -2459,14 +2458,14 @@ class TableauSiteSource:

         logger.debug(f"Parsing sql={query}")

-        upstream_db = database_info.get(c.NAME)
+        upstream_db = database_name

         if func_overridden_info is not None:
             # Override the information as per configuration
             upstream_db, platform_instance, platform, _ = func_overridden_info(
-                database_info[c.CONNECTION_TYPE],
-                database_info.get(c.NAME),
-                database_info.get(c.ID),
+                database_connection_type,
+                database_name,
+                database_id,
                 self.config.platform_instance_map,
                 self.config.lineage_overrides,
                 self.config.database_hostname_to_platform_instance_map,
@@ -2534,6 +2533,9 @@ class TableauSiteSource:
             platform_instance=self.config.platform_instance,
             func_overridden_info=get_overridden_info,
         )
+        logger.debug(
+            f"_create_lineage_from_unsupported_csql parsed_result = {parsed_result}"
+        )

         if parsed_result is None:
             return
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -761,7 +761,7 @@ class TableauUpstreamReference:


 def get_overridden_info(
-    connection_type: Optional[str],
+    connection_type: str,
     upstream_db: Optional[str],
     upstream_db_id: Optional[str],
     platform_instance_map: Optional[Dict[str, str]],