acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
  "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
  "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
  "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+ "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
  }

  _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
  description="Connect args to pass to Snowflake SqlAlchemy driver",
  exclude=True,
  )
+ token: Optional[str] = pydantic.Field(
+ default=None,
+ description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+ )

  def get_account(self) -> str:
  assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
  logger.info(f"using authenticator type '{v}'")
  return v

+ @pydantic.validator("token", always=True)
+ def validate_token_oauth_config(cls, v, values):
+ auth_type = values.get("authentication_type")
+ if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+ if not v:
+ raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+ elif v is not None:
+ raise ValueError(
+ "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+ )
+ return v
+
  @staticmethod
  def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
  if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
  application=_APPLICATION_NAME,
  **connect_args,
  )
+ elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+ return snowflake.connector.connect(
+ user=self.username,
+ account=self.account_id,
+ authenticator="oauth",
+ token=self.token, # Token generated externally and provided directly to the recipe
+ warehouse=self.warehouse,
+ role=self.role,
+ application=_APPLICATION_NAME,
+ **connect_args,
+ )
  elif self.authentication_type == "OAUTH_AUTHENTICATOR":
  return self.get_oauth_connection()
  elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":
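
The hunks above add an OAUTH_AUTHENTICATOR_TOKEN option that lets a recipe supply a pre-issued OAuth token instead of having the connector drive the OAuth flow itself. A minimal sketch of how such a config might be built, assuming the field names visible in the diff (account_id, username, authentication_type, token) and hypothetical connection values; other required fields or validators may apply in practice:

from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

# Hypothetical values; per the field description above, the token is minted by
# an external identity provider and cannot be refreshed once it expires.
config = SnowflakeConnectionConfig.parse_obj(
    {
        "account_id": "my_account",
        "username": "ingestion_user",
        "warehouse": "COMPUTE_WH",
        "role": "DATAHUB_ROLE",
        "authentication_type": "OAUTH_AUTHENTICATOR_TOKEN",
        "token": "<externally-generated-oauth-token>",
    }
)

# The new validate_token_oauth_config validator enforces the pairing: omitting
# the token with this authenticator, or supplying one with any other
# authenticator, fails validation.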

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
  return UpstreamLineageEdge.parse_obj(db_row)
  except Exception as e:
  self.report.num_upstream_lineage_edge_parsing_failed += 1
+ upstream_tables = db_row.get("UPSTREAM_TABLES")
+ downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
  self.structured_reporter.warning(
  "Failed to parse lineage edge",
- context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+ # Tricky: sometimes the full row data is too large, and so the context
+ # message gets truncated. By pulling out the upstreams and downstream
+ # list, we can at least get the important fields if truncation does occur.
+ context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
  exc=e,
  )
  return None

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,10 +129,12 @@ class SnowflakeQuery:
  row_count AS "ROW_COUNT",
  bytes AS "BYTES",
  clustering_key AS "CLUSTERING_KEY",
- auto_clustering_on AS "AUTO_CLUSTERING_ON"
+ auto_clustering_on AS "AUTO_CLUSTERING_ON",
+ is_dynamic AS "IS_DYNAMIC",
+ is_iceberg AS "IS_ICEBERG"
  FROM {db_clause}information_schema.tables t
  WHERE table_schema != 'INFORMATION_SCHEMA'
- and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+ and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
  order by table_schema, table_name"""

  @staticmethod
@@ -149,10 +151,12 @@
  row_count AS "ROW_COUNT",
  bytes AS "BYTES",
  clustering_key AS "CLUSTERING_KEY",
- auto_clustering_on AS "AUTO_CLUSTERING_ON"
+ auto_clustering_on AS "AUTO_CLUSTERING_ON",
+ is_dynamic AS "IS_DYNAMIC",
+ is_iceberg AS "IS_ICEBERG"
  FROM {db_clause}information_schema.tables t
  where table_schema='{schema_name}'
- and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+ and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
  order by table_schema, table_name"""

  @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
  LIMIT {limit} {from_clause};
  """

+ @staticmethod
+ def get_secure_view_definitions() -> str:
+ # https://docs.snowflake.com/en/sql-reference/account-usage/views
+ return """
+ SELECT
+ TABLE_CATALOG as "TABLE_CATALOG",
+ TABLE_SCHEMA as "TABLE_SCHEMA",
+ TABLE_NAME as "TABLE_NAME",
+ VIEW_DEFINITION as "VIEW_DEFINITION"
+ FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+ WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+ """
+
  @staticmethod
  def columns_for_schema(
  schema_name: str,

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
  external_lineage_queries_secs: float = -1
  num_tables_with_known_upstreams: int = 0
  num_upstream_lineage_edge_parsing_failed: int = 0
+ num_secure_views_missing_definition: int = 0

  data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None


datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
  foreign_keys: List[SnowflakeFK] = field(default_factory=list)
  tags: Optional[List[SnowflakeTag]] = None
  column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+ is_dynamic: bool = False
+ is_iceberg: bool = False
+
+ @property
+ def is_hybrid(self) -> bool:
+ return self.type is not None and self.type == "HYBRID TABLE"


  @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
  columns: List[SnowflakeColumn] = field(default_factory=list)
  tags: Optional[List[SnowflakeTag]] = None
  column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+ is_secure: bool = False


  @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
  snowflake_schemas.append(snowflake_schema)
  return snowflake_schemas

+ @serialized_lru_cache(maxsize=1)
+ def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+ secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+ lambda: defaultdict(lambda: defaultdict())
+ )
+ cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+ for view in cur:
+ db_name = view["TABLE_CATALOG"]
+ schema_name = view["TABLE_SCHEMA"]
+ view_name = view["TABLE_NAME"]
+ secure_view_definitions[db_name][schema_name][view_name] = view[
+ "VIEW_DEFINITION"
+ ]
+
+ return secure_view_definitions
+
  @serialized_lru_cache(maxsize=1)
  def get_tables_for_database(
  self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
  rows_count=table["ROW_COUNT"],
  comment=table["COMMENT"],
  clustering_key=table["CLUSTERING_KEY"],
+ is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+ is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
  )
  )
  return tables
@@ -313,6 +338,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
  rows_count=table["ROW_COUNT"],
  comment=table["COMMENT"],
  clustering_key=table["CLUSTERING_KEY"],
+ is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+ is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
  )
  )
  return tables
@@ -356,6 +383,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
  materialized=(
  view.get("is_materialized", "false").lower() == "true"
  ),
+ is_secure=(view.get("is_secure", "false").lower() == "true"),
  )
  )


datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecuto
  logger = logging.getLogger(__name__)

  # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+ # TODO: Move to the standardized types in sql_types.py
  SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
  "DATE": DateType,
  "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  view_identifier = self.identifiers.get_dataset_identifier(
  view.name, schema_name, db_name
  )
+ if view.is_secure and not view.view_definition:
+ view.view_definition = self.fetch_secure_view_definition(
+ view.name, schema_name, db_name
+ )
  if view.view_definition:
  self.aggregator.add_view_definition(
  view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  default_db=db_name,
  default_schema=schema_name,
  )
+ elif view.is_secure:
+ self.report.num_secure_views_missing_definition += 1

  if self.config.include_technical_schema:
  for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  context=f"{db_name}.{schema_name}",
  )

+ def fetch_secure_view_definition(
+ self, table_name: str, schema_name: str, db_name: str
+ ) -> Optional[str]:
+ try:
+ view_definitions = self.data_dictionary.get_secure_view_definitions()
+ return view_definitions[db_name][schema_name][table_name]
+ except Exception as e:
+ if isinstance(e, SnowflakePermissionError):
+ error_msg = (
+ "Failed to get secure views definitions. Please check permissions."
+ )
+ else:
+ error_msg = "Failed to get secure views definitions"
+ self.structured_reporter.warning(
+ error_msg,
+ exc=e,
+ )
+ return None
+
  def fetch_views_for_schema(
  self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
  ) -> List[SnowflakeView]:
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
  ) -> DatasetProperties:
  custom_properties = {}

- if isinstance(table, SnowflakeTable) and table.clustering_key:
- custom_properties["CLUSTERING_KEY"] = table.clustering_key
+ if isinstance(table, SnowflakeTable):
+ if table.clustering_key:
+ custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+ if table.is_hybrid:
+ custom_properties["IS_HYBRID"] = "true"
+
+ if table.is_dynamic:
+ custom_properties["IS_DYNAMIC"] = "true"
+
+ if table.is_iceberg:
+ custom_properties["IS_ICEBERG"] = "true"
+
+ if isinstance(table, SnowflakeView) and table.is_secure:
+ custom_properties["IS_SECURE"] = "true"

  return DatasetProperties(
  name=table.name,

datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -1,6 +1,6 @@
  import abc
  from functools import cached_property
- from typing import ClassVar, Literal, Optional, Tuple
+ from typing import ClassVar, List, Literal, Optional, Tuple

  from datahub.configuration.pattern_utils import is_schema_allowed
  from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
  ) -> bool:
  if not dataset_type or not dataset_name:
  return True
- dataset_params = dataset_name.split(".")
  if dataset_type.lower() not in (
  SnowflakeObjectDomain.TABLE,
  SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
  if _is_sys_table(dataset_name):
  return False

+ dataset_params = _split_qualified_name(dataset_name)
  if len(dataset_params) != 3:
  self.structured_reporter.info(
  title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
  return table_name.lower().startswith("sys$")


+ def _split_qualified_name(qualified_name: str) -> List[str]:
+ """
+ Split a qualified name into its constituent parts.
+
+ >>> _split_qualified_name("db.my_schema.my_table")
+ ['db', 'my_schema', 'my_table']
+ >>> _split_qualified_name('"db"."my_schema"."my_table"')
+ ['db', 'my_schema', 'my_table']
+ >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+ ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+ >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+ ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+ """
+
+ # Fast path - no quotes.
+ if '"' not in qualified_name:
+ return qualified_name.split(".")
+
+ # First pass - split on dots that are not inside quotes.
+ in_quote = False
+ parts: List[List[str]] = [[]]
+ for char in qualified_name:
+ if char == '"':
+ in_quote = not in_quote
+ elif char == "." and not in_quote:
+ parts.append([])
+ else:
+ parts[-1].append(char)
+
+ # Second pass - remove outer pairs of quotes.
+ result = []
+ for part in parts:
+ if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+ part = part[1:-1]
+
+ result.append("".join(part))
+
+ return result
+
+
  # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
  # For example "test-database"."test-schema".test_table
  # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
  def _cleanup_qualified_name(
  qualified_name: str, structured_reporter: SourceReport
  ) -> str:
- name_parts = qualified_name.split(".")
+ name_parts = _split_qualified_name(qualified_name)
  if len(name_parts) != 3:
  if not _is_sys_table(qualified_name):
  structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
  )
  return qualified_name.replace('"', "")
  return _combine_identifier_parts(
- db_name=name_parts[0].strip('"'),
- schema_name=name_parts[1].strip('"'),
- table_name=name_parts[2].strip('"'),
+ db_name=name_parts[0],
+ schema_name=name_parts[1],
+ table_name=name_parts[2],
  )


datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
  support_status,
  )
  from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+ from datahub.ingestion.api.incremental_properties_helper import (
+ auto_incremental_properties,
+ )
  from datahub.ingestion.api.source import (
  CapabilityReport,
  MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
  functools.partial(
  auto_incremental_lineage, self.config.incremental_lineage
  ),
+ functools.partial(
+ auto_incremental_properties, self.config.incremental_properties
+ ),
  StaleEntityRemovalHandler.create(
  self, self.config, self.ctx
  ).workunit_processor,
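
These hunks register the new auto_incremental_properties workunit processor (from the new datahub/ingestion/api/incremental_properties_helper.py listed above), gated by a config.incremental_properties setting. A hedged sketch of a recipe that might enable it, assuming the flag is a boolean toggle alongside incremental_lineage; the exact semantics and defaults are not shown in this diff:

# Hypothetical recipe, expressed as a Python dict.
recipe = {
    "source": {
        "type": "snowflake",
        "config": {
            "account_id": "my_account",
            "username": "ingestion_user",
            "incremental_lineage": True,
            # Assumed boolean flag backing self.config.incremental_properties.
            "incremental_properties": True,
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}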

datahub/ingestion/source/sql/athena.py

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.source import StructuredLogLevel
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_util import make_s3_urn
  from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
  register_custom_type,
  )
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.ingestion.source.sql.sql_utils import (
  add_table_to_schema_container,
  gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
  get_schema_fields_for_sqlalchemy_column,
  )

+ try:
+ from typing_extensions import override
+ except ImportError:
+ _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+ def override(f: _F, /) -> _F: # noqa: F811
+ return f
+
+
  logger = logging.getLogger(__name__)

  assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
  - Profiling when enabled.
  """

- table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+ config: AthenaConfig
+ report: SQLSourceReport

  def __init__(self, config, ctx):
  super().__init__(config, ctx, "athena")
  self.cursor: Optional[BaseCursor] = None

+ self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
  @classmethod
  def create(cls, config_dict, ctx):
  config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
  )

  # It seems like database/schema filter in the connection string does not work and this to work around that
+ @override
  def get_schema_names(self, inspector: Inspector) -> List[str]:
  athena_config = typing.cast(AthenaConfig, self.config)
  schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
  return [schema for schema in schemas if schema == athena_config.database]
  return schemas

- # Overwrite to get partitions
+ @classmethod
+ def _casted_partition_key(cls, key: str) -> str:
+ # We need to cast the partition keys to a VARCHAR, since otherwise
+ # Athena may throw an error during concatenation / comparison.
+ return f"CAST({key} as VARCHAR)"
+
+ @override
  def get_partitions(
  self, inspector: Inspector, schema: str, table: str
- ) -> List[str]:
- partitions = []
-
- athena_config = typing.cast(AthenaConfig, self.config)
-
- if not athena_config.extract_partitions:
- return []
+ ) -> Optional[List[str]]:
+ if not self.config.extract_partitions:
+ return None

  if not self.cursor:
- return []
+ return None

  metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
  table_name=table, schema_name=schema
  )

- if metadata.partition_keys:
- for key in metadata.partition_keys:
- if key.name:
- partitions.append(key.name)
-
- if not partitions:
- return []
+ partitions = []
+ for key in metadata.partition_keys:
+ if key.name:
+ partitions.append(key.name)
+ if not partitions:
+ return []

- # We create an artiificaial concatenated partition key to be able to query max partition easier
- part_concat = "|| '-' ||".join(partitions)
+ with self.report.report_exc(
+ message="Failed to extract partition details",
+ context=f"{schema}.{table}",
+ level=StructuredLogLevel.WARN,
+ ):
+ # We create an artifical concatenated partition key to be able to query max partition easier
+ part_concat = " || '-' || ".join(
+ self._casted_partition_key(key) for key in partitions
+ )
  max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
  ret = self.cursor.execute(max_partition_query)
  max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
  partitions=partitions,
  max_partition=max_partition,
  )
- return partitions

- return []
+ return partitions

  # Overwrite to modify the creation of schema fields
  def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
  if partition and partition.max_partition:
  max_partition_filters = []
  for key, value in partition.max_partition.items():
- max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'")
+ max_partition_filters.append(
+ f"{self._casted_partition_key(key)} = '{value}'"
+ )
  max_partition = str(partition.max_partition)
  return (
  max_partition,
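
The reworked get_partitions builds one concatenated partition key, casting each column to VARCHAR via the new _casted_partition_key helper, and then queries the "<table>$partitions" metadata table for its maximum value. A small, self-contained illustration of the SQL this construction yields, using hypothetical schema/table/partition names:

# Mirrors the query construction shown in the diff above; names are hypothetical.
partitions = ["year", "month"]
schema, table = "web_logs", "requests"


def casted_partition_key(key: str) -> str:
    # Same casting rule as AthenaSource._casted_partition_key.
    return f"CAST({key} as VARCHAR)"


part_concat = " || '-' || ".join(casted_partition_key(key) for key in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f"where {part_concat} = "
    f'(select max({part_concat}) from "{schema}"."{table}$partitions")'
)
print(max_partition_query)
# select year,month from "web_logs"."requests$partitions" where CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR) = (select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)) from "web_logs"."requests$partitions")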

datahub/ingestion/source/sql/mssql/source.py

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

  import pydantic
  import sqlalchemy.dialects.mssql
-
- # This import verifies that the dependencies are available.
  from pydantic.fields import Field
  from sqlalchemy import create_engine, inspect
  from sqlalchemy.engine.base import Connection
@@ -50,6 +48,7 @@ from datahub.ingestion.source.sql.sql_config import (
  BasicSQLAlchemyConfig,
  make_sqlalchemy_uri,
  )
+ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.metadata.schema_classes import (
  BooleanTypeClass,
  NumberTypeClass,
@@ -78,6 +77,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
  include_stored_procedures_code: bool = Field(
  default=True, description="Include information about object code."
  )
+ procedure_pattern: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+ description="Regex patterns for stored procedures to filter in ingestion."
+ "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+ )
  include_jobs: bool = Field(
  default=True,
  description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
@@ -164,6 +168,8 @@ class SQLServerSource(SQLAlchemySource):
  If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
  """

+ report: SQLSourceReport
+
  def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
  super().__init__(config, ctx, "mssql")
  # Cache the table and column descriptions
@@ -416,10 +422,16 @@ class SQLServerSource(SQLAlchemySource):
  data_flow = MSSQLDataFlow(entity=mssql_default_job)
  with inspector.engine.connect() as conn:
  procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
- procedures = [
- StoredProcedure(flow=mssql_default_job, **procedure_data)
- for procedure_data in procedures_data_list
- ]
+ procedures: List[StoredProcedure] = []
+ for procedure_data in procedures_data_list:
+ procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}"
+ if not self.config.procedure_pattern.allowed(procedure_full_name):
+ self.report.report_dropped(procedure_full_name)
+ continue
+ procedures.append(
+ StoredProcedure(flow=mssql_default_job, **procedure_data)
+ )
+
  if procedures:
  yield from self.construct_flow_workunits(data_flow=data_flow)
  for procedure in procedures:
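
The new procedure_pattern filter matches stored procedures against their fully qualified database.schema.procedure_name using the existing AllowDenyPattern config type. A short sketch of how the matching behaves, using the example regex from the field description and hypothetical procedure names:

from datahub.configuration.common import AllowDenyPattern

# Keep only procedures starting with "customer" in the Customer database's
# public schema (the example given in the field description above).
pattern = AllowDenyPattern(allow=["Customer.public.customer.*"])

print(pattern.allowed("Customer.public.customer_refresh"))  # True -> ingested
print(pattern.allowed("Customer.public.orders_cleanup"))    # False -> reported as dropped
print(pattern.allowed("Sales.public.customer_refresh"))     # False -> reported as dropped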