acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (95)
  1. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
  2. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
  3. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/__init__.py +1 -25
  5. datahub/_version.py +13 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  7. datahub/cli/check_cli.py +1 -1
  8. datahub/cli/cli_utils.py +3 -3
  9. datahub/cli/container_cli.py +1 -64
  10. datahub/cli/iceberg_cli.py +707 -0
  11. datahub/cli/ingest_cli.py +2 -2
  12. datahub/emitter/composite_emitter.py +36 -0
  13. datahub/emitter/rest_emitter.py +1 -1
  14. datahub/entrypoints.py +26 -5
  15. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  16. datahub/ingestion/api/registry.py +1 -1
  17. datahub/ingestion/glossary/classification_mixin.py +6 -0
  18. datahub/ingestion/glossary/classifier.py +3 -2
  19. datahub/ingestion/graph/client.py +2 -1
  20. datahub/ingestion/graph/entity_versioning.py +201 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  22. datahub/ingestion/run/connection.py +1 -1
  23. datahub/ingestion/run/pipeline.py +3 -3
  24. datahub/ingestion/source/abs/report.py +2 -2
  25. datahub/ingestion/source/apply/__init__.py +0 -0
  26. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  27. datahub/ingestion/source/aws/glue.py +5 -2
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  29. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  30. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  31. datahub/ingestion/source/delta_lake/report.py +2 -2
  32. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  33. datahub/ingestion/source/elastic_search.py +2 -1
  34. datahub/ingestion/source/ge_profiling_config.py +11 -7
  35. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  36. datahub/ingestion/source/identity/azure_ad.py +6 -14
  37. datahub/ingestion/source/identity/okta.py +2 -1
  38. datahub/ingestion/source/kafka/kafka.py +2 -1
  39. datahub/ingestion/source/kafka_connect/common.py +2 -1
  40. datahub/ingestion/source/ldap.py +2 -1
  41. datahub/ingestion/source/looker/looker_config.py +3 -1
  42. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  43. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  44. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  45. datahub/ingestion/source/looker/lookml_config.py +29 -8
  46. datahub/ingestion/source/looker/lookml_source.py +110 -22
  47. datahub/ingestion/source/mode.py +2 -4
  48. datahub/ingestion/source/mongodb.py +2 -1
  49. datahub/ingestion/source/nifi.py +2 -1
  50. datahub/ingestion/source/powerbi/config.py +2 -2
  51. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  52. datahub/ingestion/source/redash.py +5 -5
  53. datahub/ingestion/source/salesforce.py +4 -1
  54. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  55. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  56. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  57. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  58. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  59. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  60. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  61. datahub/ingestion/source/sql/clickhouse.py +5 -43
  62. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  63. datahub/ingestion/source/sql/mssql/source.py +17 -0
  64. datahub/ingestion/source/sql/sql_config.py +0 -10
  65. datahub/ingestion/source/tableau/tableau.py +16 -13
  66. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  67. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  68. datahub/ingestion/source/unity/proxy.py +2 -2
  69. datahub/ingestion/source/unity/report.py +1 -0
  70. datahub/ingestion/source_config/operation_config.py +9 -0
  71. datahub/ingestion/source_report/pulsar.py +5 -4
  72. datahub/metadata/_schema_classes.py +304 -6
  73. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  74. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  75. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  76. datahub/metadata/schema.avsc +211 -12
  77. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  78. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  79. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  80. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  81. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  82. datahub/metadata/schemas/Deprecation.avsc +12 -0
  83. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  84. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  85. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  86. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  87. datahub/metadata/schemas/PostInfo.avsc +28 -2
  88. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  89. datahub/specific/dashboard.py +43 -1
  90. datahub/telemetry/telemetry.py +4 -4
  91. datahub/testing/check_imports.py +28 -0
  92. datahub/upgrade/upgrade.py +17 -9
  93. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
  94. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  95. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -249,6 +249,12 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )
 
+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -302,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )
 
+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
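
For orientation only, here is a minimal recipe-style sketch (not part of the diff) showing where the new pushdown_deny_usernames option would sit. It is expressed as a plain Python dict; all account details and usernames are placeholders, and the option only takes effect when use_queries_v2 is enabled, per the field description above.

    # Hypothetical recipe fragment; every identifier below is a placeholder.
    recipe = {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "example_account",  # placeholder
                "use_queries_v2": True,
                # Skip known high-volume service accounts during lineage/usage/queries extraction.
                "pushdown_deny_usernames": ["LOOKER_SVC", "FIVETRAN_SVC"],  # placeholder usernames
            },
        },
        "sink": {"type": "console"},
    }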

datahub/ingestion/source/snowflake/snowflake_query.py
@@ -159,6 +159,17 @@ class SnowflakeQuery:
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
+    @staticmethod
+    def get_all_tags():
+        return """
+        SELECT tag_database as "TAG_DATABASE",
+        tag_schema AS "TAG_SCHEMA",
+        tag_name AS "TAG_NAME",
+        FROM snowflake.account_usage.tag_references
+        GROUP BY TAG_DATABASE , TAG_SCHEMA, tag_name
+        ORDER BY TAG_DATABASE, TAG_SCHEMA, TAG_NAME ASC;
+        """
+
     @staticmethod
     def get_all_tags_on_object_with_propagation(
         db_name: str, quoted_identifier: str, domain: str
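
A trivial way to preview the new query without a Snowflake connection (this assumes nothing beyond the static method added above):

    from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery

    # Prints the SQL text issued against snowflake.account_usage.tag_references.
    print(SnowflakeQuery.get_all_tags())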

datahub/ingestion/source/snowflake/snowflake_report.py
@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage: Dict[str, List[str]] = field(default_factory=dict)
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None
@@ -114,6 +115,7 @@ class SnowflakeV2Report(
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
+    num_structured_property_templates_created: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 
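
A rough sketch of what the switch to LossyDict buys; the retention limit is an implementation detail of datahub.utilities.lossy_collections and is assumed here rather than taken from the diff.

    from datahub.utilities.lossy_collections import LossyDict

    # LossyDict behaves like a dict for writers, but is designed to keep only a
    # bounded sample of entries so that serialized ingestion reports stay small
    # even when an account has a very large lineage map.
    upstream_lineage = LossyDict()
    for i in range(10_000):
        upstream_lineage[f"table_{i}"] = ["upstream_a", "upstream_b"]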

datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -285,6 +285,23 @@ class SnowflakeDataDictionary(SupportsAsObj):
 
         return secure_view_definitions
 
+    def get_all_tags(self) -> List[SnowflakeTag]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_all_tags(),
+        )
+
+        tags = [
+            SnowflakeTag(
+                database=tag["TAG_DATABASE"],
+                schema=tag["TAG_SCHEMA"],
+                name=tag["TAG_NAME"],
+                value="",
+            )
+            for tag in cur
+        ]
+
+        return tags
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
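
For illustration, the row-to-object mapping performed by the new get_all_tags helper, using a made-up row; SnowflakeTag and its fields are taken from the hunk above.

    from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeTag

    row = {"TAG_DATABASE": "GOVERNANCE", "TAG_SCHEMA": "TAGS", "TAG_NAME": "PII"}  # hypothetical row
    tag = SnowflakeTag(
        database=row["TAG_DATABASE"],
        schema=row["TAG_SCHEMA"],
        name=row["TAG_NAME"],
        value="",  # only the tag's identity is needed to create a template
    )
    # Identifier used for the structured_property_pattern allow/deny check.
    print(tag.tag_identifier())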

datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -1,10 +1,10 @@
 import itertools
 import logging
+import time
 from typing import Dict, Iterable, List, Optional, Union
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import (
-    get_sys_time,
     make_data_platform_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
@@ -74,7 +74,6 @@ from datahub.ingestion.source_report.ingestion_stage import (
     PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     GlobalTags,
     Status,
     SubTypes,
@@ -101,15 +100,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     StringType,
     TimeType,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.structured import (
-    StructuredPropertyDefinition,
-)
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
 from datahub.metadata.urns import (
-    ContainerUrn,
-    DatasetUrn,
-    DataTypeUrn,
-    EntityTypeUrn,
     SchemaFieldUrn,
     StructuredPropertyUrn,
 )
@@ -191,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.domain_registry: Optional[DomainRegistry] = domain_registry
         self.classification_handler = ClassificationHandler(self.config, self.report)
         self.tag_extractor = SnowflakeTagExtractor(
-            config, self.data_dictionary, self.report
+            config, self.data_dictionary, self.report, identifiers
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
         self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
@@ -217,6 +209,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         return self.identifiers.snowflake_identifier(identifier)
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.extract_tags_as_structured_properties:
+            logger.info("Creating structured property templates for tags")
+            yield from self.tag_extractor.create_structured_property_templates()
+            # We have to wait until cache invalidates to make sure the structured property template is available
+            logger.info(
+                f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+            )
+            time.sleep(
+                self.config.structured_properties_template_cache_invalidation_interval
+            )
         self.databases = []
         for database in self.get_databases() or []:
             self.report.report_entity_scanned(database.name, "database")
@@ -491,15 +493,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         try:
             view_definitions = self.data_dictionary.get_secure_view_definitions()
             return view_definitions[db_name][schema_name][table_name]
+        except KeyError:
+            # Received secure view definitions but the view is not present in results
+            self.structured_reporter.info(
+                title="Secure view definition not found",
+                message="Lineage will be missing for the view.",
+                context=f"{db_name}.{schema_name}.{table_name}",
+            )
+            return None
         except Exception as e:
-            if isinstance(e, SnowflakePermissionError):
-                error_msg = (
-                    "Failed to get secure views definitions. Please check permissions."
-                )
-            else:
-                error_msg = "Failed to get secure views definitions"
+            action_msg = (
+                "Please check permissions."
+                if isinstance(e, SnowflakePermissionError)
+                else ""
+            )
+
             self.structured_reporter.warning(
-                error_msg,
+                title="Failed to get secure views definitions",
+                message=f"Lineage will be missing for the view. {action_msg}",
+                context=f"{db_name}.{schema_name}.{table_name}",
                 exc=e,
             )
             return None
@@ -688,6 +700,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
         use_sp = self.config.extract_tags_as_structured_properties
+
         identifier = (
             self.snowflake_identifier(tag.structured_property_identifier())
             if use_sp
@@ -698,10 +711,11 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             return
 
         self.report.report_tag_processed(identifier)
+
         if use_sp:
-            yield from self.gen_tag_as_structured_property_workunits(tag)
-        else:
-            yield from self.gen_tag_workunits(tag)
+            return
+
+        yield from self.gen_tag_workunits(tag)
 
     def _format_tags_as_structured_properties(
         self, tags: List[SnowflakeTag]
@@ -722,6 +736,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if table.tags:
             for tag in table.tags:
                 yield from self._process_tag(tag)
+
             for column_name in table.column_tags:
                 for tag in table.column_tags[column_name]:
                     yield from self._process_tag(tag)
@@ -893,29 +908,6 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             entityUrn=tag_urn, aspect=tag_properties_aspect
         ).as_workunit()
 
-    def gen_tag_as_structured_property_workunits(
-        self, tag: SnowflakeTag
-    ) -> Iterable[MetadataWorkUnit]:
-        identifier = self.snowflake_identifier(tag.structured_property_identifier())
-        urn = StructuredPropertyUrn(identifier).urn()
-        aspect = StructuredPropertyDefinition(
-            qualifiedName=identifier,
-            displayName=tag.name,
-            valueType=DataTypeUrn("datahub.string").urn(),
-            entityTypes=[
-                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
-            ],
-            lastModified=AuditStamp(
-                time=get_sys_time(), actor="urn:li:corpuser:datahub"
-            ),
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=urn,
-            aspect=aspect,
-        ).as_workunit()
-
     def gen_column_tags_as_structured_properties(
         self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
     ) -> Iterable[MetadataWorkUnit]:

datahub/ingestion/source/snowflake/snowflake_tag.py
@@ -1,6 +1,9 @@
 import logging
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional
 
+from datahub.emitter.mce_builder import get_sys_time
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     SnowflakeV2Config,
@@ -12,7 +15,22 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeTag,
     _SnowflakeTagCache,
 )
-from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
+from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeCommonMixin,
+    SnowflakeIdentifierBuilder,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
+from datahub.metadata.com.linkedin.pegasus2avro.structured import (
+    StructuredPropertyDefinition,
+)
+from datahub.metadata.urns import (
+    ContainerUrn,
+    DatasetUrn,
+    DataTypeUrn,
+    EntityTypeUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+)
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -23,11 +41,12 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         config: SnowflakeV2Config,
         data_dictionary: SnowflakeDataDictionary,
         report: SnowflakeV2Report,
+        snowflake_identifiers: SnowflakeIdentifierBuilder,
     ) -> None:
         self.config = config
         self.data_dictionary = data_dictionary
         self.report = report
-
+        self.snowflake_identifiers = snowflake_identifiers
         self.tag_cache: Dict[str, _SnowflakeTagCache] = {}
 
     def _get_tags_on_object_without_propagation(
@@ -59,6 +78,41 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             raise ValueError(f"Unknown domain {domain}")
         return tags
 
+    def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
+        for tag in self.data_dictionary.get_all_tags():
+            if not self.config.structured_property_pattern.allowed(
+                tag.tag_identifier()
+            ):
+                continue
+            if self.config.extract_tags_as_structured_properties:
+                self.report.num_structured_property_templates_created += 1
+                yield from self.gen_tag_as_structured_property_workunits(tag)
+
+    def gen_tag_as_structured_property_workunits(
+        self, tag: SnowflakeTag
+    ) -> Iterable[MetadataWorkUnit]:
+        identifier = self.snowflake_identifiers.snowflake_identifier(
+            tag.structured_property_identifier()
+        )
+        urn = StructuredPropertyUrn(identifier).urn()
+        aspect = StructuredPropertyDefinition(
+            qualifiedName=identifier,
+            displayName=tag.name,
+            valueType=DataTypeUrn("datahub.string").urn(),
+            entityTypes=[
+                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
+            ],
+            lastModified=AuditStamp(
+                time=get_sys_time(), actor="urn:li:corpuser:datahub"
+            ),
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=aspect,
+        ).as_workunit()
+
     def _get_tags_on_object_with_propagation(
         self,
         domain: str,
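
As a quick orientation, the structured-property URN that a tag template ends up keyed by; the identifier string here is an assumption for illustration, while the real value comes from tag.structured_property_identifier() passed through the Snowflake identifier normalization.

    from datahub.metadata.urns import StructuredPropertyUrn

    identifier = "snowflake.governance.tags.pii"  # hypothetical normalized identifier
    print(StructuredPropertyUrn(identifier).urn())
    # -> urn:li:structuredProperty:snowflake.governance.tags.pii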

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -5,6 +5,7 @@ import logging
 import os
 import os.path
 import platform
+import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Union
 
@@ -33,6 +34,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
+    SnowflakeObjectDomain,
 )
 from datahub.ingestion.source.snowflake.snowflake_assertion import (
     SnowflakeAssertionsHandler,
@@ -162,6 +164,8 @@ class SnowflakeV2Source(
         self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
+        self.discovered_datasets: Optional[List[str]] = None
+
         self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
             SqlParsingAggregator(
                 platform=self.identifiers.platform,
@@ -182,6 +186,8 @@ class SnowflakeV2Source(
                 generate_usage_statistics=False,
                 generate_operations=False,
                 format_queries=self.config.format_sql_queries,
+                is_temp_table=self._is_temp_table,
+                is_allowed_table=self._is_allowed_table,
             )
         )
         self.report.sql_aggregator = self.aggregator.report
@@ -444,6 +450,34 @@ class SnowflakeV2Source(
 
         return _report
 
+    def _is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            return True
+
+        # This is also a temp table if
+        #   1. this name would be allowed by the dataset patterns, and
+        #   2. we have a list of discovered tables, and
+        #   3. it's not in the discovered tables list
+        if (
+            self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE)
+            and self.discovered_datasets
+            and name not in self.discovered_datasets
+        ):
+            return True
+
+        return False
+
+    def _is_allowed_table(self, name: str) -> bool:
+        if self.discovered_datasets and name not in self.discovered_datasets:
+            return False
+
+        return self.filters.is_dataset_pattern_allowed(
+            name, SnowflakeObjectDomain.TABLE
+        )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -513,7 +547,7 @@ class SnowflakeV2Source(
             )
             return
 
-        discovered_datasets = discovered_tables + discovered_views
+        self.discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
             with self.report.new_stage(f"*: {VIEW_PARSING}"):
@@ -533,18 +567,20 @@ class SnowflakeV2Source(
                     include_queries=self.config.include_queries,
                     include_query_usage_statistics=self.config.include_query_usage_statistics,
                     user_email_pattern=self.config.user_email_pattern,
+                    pushdown_deny_usernames=self.config.pushdown_deny_usernames,
                 ),
                 structured_report=self.report,
                 filters=self.filters,
                 identifiers=self.identifiers,
                 schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
+                discovered_tables=self.discovered_datasets,
                 graph=self.ctx.graph,
             )
 
             # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
             # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
             # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+            # This also applies for the _is_temp_table and _is_allowed_table methods above, duplicated from SnowflakeQueriesExtractor.
             self.report.queries_extractor = queries_extractor.report
             yield from queries_extractor.get_workunits_internal()
             queries_extractor.close()
@@ -568,12 +604,14 @@ class SnowflakeV2Source(
         if (
             self.config.include_usage_stats or self.config.include_operational_stats
         ) and self.usage_extractor:
-            yield from self.usage_extractor.get_usage_workunits(discovered_datasets)
+            yield from self.usage_extractor.get_usage_workunits(
+                self.discovered_datasets
+            )
 
         if self.config.include_assertion_results:
             yield from SnowflakeAssertionsHandler(
                 self.config, self.report, self.connection, self.identifiers
-            ).get_assertion_workunits(discovered_datasets)
+            ).get_assertion_workunits(self.discovered_datasets)
 
         self.connection.close()
 
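
A self-contained illustration (plain Python, not the DataHub API) of the heuristic that _is_temp_table encodes; the pattern and table names are invented, and the real method additionally consults the source's dataset allow/deny filters.

    import re

    temporary_tables_pattern = [r".*\.FIVETRAN_.*_STAGING\..*"]  # hypothetical config value
    discovered_datasets = ["analytics.public.orders", "analytics.public.customers"]

    def looks_like_temp_table(name: str) -> bool:
        # Explicitly configured temp-table patterns win first.
        if any(re.match(p, name, flags=re.IGNORECASE) for p in temporary_tables_pattern):
            return True
        # Otherwise, a table that should have been discovered during schema
        # extraction but was not is also treated as temporary.
        return bool(discovered_datasets) and name not in discovered_datasets

    print(looks_like_temp_table("analytics.public.orders"))             # False
    print(looks_like_temp_table("db.FIVETRAN_ABC_STAGING.tmp_orders"))  # True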

datahub/ingestion/source/sql/clickhouse.py
@@ -53,7 +53,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     DatasetSnapshotClass,
     UpstreamClass,
 )
@@ -418,41 +417,11 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
             dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
             assert dataset_snapshot
 
-            lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
-                wu.metadata.proposedSnapshot.urn
-            )
+            lineage_mcp = self.get_lineage_mcp(wu.metadata.proposedSnapshot.urn)
 
             if lineage_mcp is not None:
                 yield lineage_mcp.as_workunit()
 
-            if lineage_properties_aspect:
-                aspects = dataset_snapshot.aspects
-                if aspects is None:
-                    aspects = []
-
-                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
-
-                for aspect in aspects:
-                    if isinstance(aspect, DatasetPropertiesClass):
-                        dataset_properties_aspect = aspect
-
-                if dataset_properties_aspect is None:
-                    dataset_properties_aspect = DatasetPropertiesClass()
-                    aspects.append(dataset_properties_aspect)
-
-                custom_properties = (
-                    {
-                        **dataset_properties_aspect.customProperties,
-                        **lineage_properties_aspect.customProperties,
-                    }
-                    if dataset_properties_aspect.customProperties
-                    else lineage_properties_aspect.customProperties
-                )
-                dataset_properties_aspect.customProperties = custom_properties
-                dataset_snapshot.aspects = aspects
-
-                dataset_snapshot.aspects.append(dataset_properties_aspect)
-
             # Emit the work unit from super.
             yield wu
 
@@ -656,19 +625,16 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
     def get_lineage_mcp(
         self, dataset_urn: str
-    ) -> Tuple[
-        Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]
-    ]:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
         if dataset_key is None:
-            return None, None
+            return None
 
         if not self._lineage_map:
            self._populate_lineage()
         assert self._lineage_map is not None
 
         upstream_lineage: List[UpstreamClass] = []
-        custom_properties: Dict[str, str] = {}
 
         if dataset_key.name in self._lineage_map:
             item = self._lineage_map[dataset_key.name]
@@ -684,16 +650,12 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
                 )
                 upstream_lineage.append(upstream_table)
 
-        properties = None
-        if custom_properties:
-            properties = DatasetPropertiesClass(customProperties=custom_properties)
-
         if not upstream_lineage:
-            return None, properties
+            return None
 
         mcp = MetadataChangeProposalWrapper(
             entityUrn=dataset_urn,
             aspect=UpstreamLineage(upstreams=upstream_lineage),
         )
 
-        return mcp, properties
+        return mcp

datahub/ingestion/source/sql/mssql/job_models.py
@@ -7,7 +7,12 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.emitter.mcp_builder import (
+    DatabaseKey,
+    SchemaKey,
+)
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
@@ -171,11 +176,7 @@ class MSSQLDataJob:
             flow_id=self.entity.flow.formatted_name,
             job_id=self.entity.formatted_name,
             cluster=self.entity.flow.cluster,
-            platform_instance=(
-                self.entity.flow.platform_instance
-                if self.entity.flow.platform_instance
-                else None
-            ),
+            platform_instance=self.entity.flow.platform_instance,
         )
 
     def add_property(
@@ -222,6 +223,26 @@ class MSSQLDataJob:
             )
         return None
 
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        key_args = dict(
+            platform=self.entity.flow.orchestrator,
+            instance=self.entity.flow.platform_instance,
+            env=self.entity.flow.env,
+            database=self.entity.flow.db,
+        )
+        container_key = (
+            SchemaKey(
+                schema=self.entity.schema,
+                **key_args,
+            )
+            if isinstance(self.entity, StoredProcedure)
+            else DatabaseKey(
+                **key_args,
+            )
+        )
+        return ContainerClass(container=container_key.as_urn())
+
 
 @dataclass
 class MSSQLDataFlow:
@@ -244,9 +265,7 @@ class MSSQLDataFlow:
             orchestrator=self.entity.orchestrator,
             flow_id=self.entity.formatted_name,
             cluster=self.entity.cluster,
-            platform_instance=(
-                self.entity.platform_instance if self.entity.platform_instance else None
-            ),
+            platform_instance=self.entity.platform_instance,
         )
 
     @property
@@ -267,3 +286,13 @@ class MSSQLDataFlow:
             ),
         )
         return None
+
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        databaseKey = DatabaseKey(
+            platform=self.entity.orchestrator,
+            instance=self.entity.platform_instance,
+            env=self.entity.env,
+            database=self.entity.db,
+        )
+        return ContainerClass(container=databaseKey.as_urn())
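
A sketch (all values are placeholders) of the container URNs the new as_container_aspect properties resolve to: flows and plain jobs attach to the database container, while stored procedures attach to the schema container.

    from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
    from datahub.metadata.schema_classes import ContainerClass

    database_key = DatabaseKey(
        platform="mssql", instance=None, env="PROD", database="AdventureWorks"
    )
    schema_key = SchemaKey(
        platform="mssql", instance=None, env="PROD", database="AdventureWorks", schema="dbo"
    )

    flow_container = ContainerClass(container=database_key.as_urn())
    procedure_container = ContainerClass(container=schema_key.as_urn())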

datahub/ingestion/source/sql/mssql/source.py
@@ -108,6 +108,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=True,
         description="Enable lineage extraction for stored procedures",
     )
+    include_containers_for_pipelines: bool = Field(
+        default=False,
+        description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -641,6 +645,12 @@ class SQLServerSource(SQLAlchemySource):
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
 
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_container_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -683,6 +693,13 @@ class SQLServerSource(SQLAlchemySource):
                 entityUrn=data_flow.urn,
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
+
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_flow.as_container_aspect,
+            ).as_workunit()
+
         # TODO: Add SubType when it appear
 
     def get_inspectors(self) -> Iterable[Inspector]:
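
Finally, a recipe-style sketch (plain dict, placeholder connection details) showing the new flag in context; as the field description notes, it requires backend model support introduced in 0.15.0.1.

    # Hypothetical mssql recipe fragment; host and database names are placeholders.
    recipe = {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "mssql.example.com:1433",  # placeholder
                "database": "AdventureWorks",           # placeholder
                "include_containers_for_pipelines": True,
            },
        },
        "sink": {"type": "console"},
    }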