acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -1,6 +1,6 @@
 import logging
 from functools import partial
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union
 
 from lark import Token, Tree
 
@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
         if isinstance(node, Token):
             return None
 
-        for child in cast(Tree, node).children:
+        for child in node.children:
             child_node: Optional[Tree] = internal(child)
             if child_node is not None:
                 return child_node
@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
                 logger.debug(f"Unable to resolve parameter reference to {ref}")
                 values.append(ref)
         elif isinstance(node, Token):
-            values.append(cast(Token, node).value)
+            values.append(node.value)
             return
         else:
             for child in node.children:

datahub/ingestion/source/powerbi/m_query/validator.py

@@ -1,7 +1,7 @@
 import logging
 from typing import Optional, Tuple
 
-from datahub.ingestion.source.powerbi.m_query import resolver
+import datahub.ingestion.source.powerbi.m_query.data_classes
 
 logger = logging.getLogger(__name__)
 
@@ -14,12 +14,18 @@ def validate_parse_tree(
     :param native_query_enabled: Whether user want to extract lineage from native query
     :return: True or False.
     """
-    function_names = [fun.value for fun in resolver.FunctionName]
+    function_names = [
+        fun.value
+        for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+    ]
     if not any(fun in expression for fun in function_names):
         return False, "DataAccess function is not present in M-Query expression."
 
     if native_query_enabled is False:
-        if resolver.FunctionName.NATIVE_QUERY.value in function_names:
+        if (
+            datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+            in function_names
+        ):
             return (
                 False,
                 "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",

datahub/ingestion/source/powerbi/powerbi.py

@@ -10,6 +10,7 @@ from typing import Iterable, List, Optional, Tuple, Union
 import more_itertools
 
 import datahub.emitter.mce_builder as builder
+import datahub.ingestion.source.powerbi.m_query.data_classes
 import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
@@ -42,12 +43,13 @@ from datahub.ingestion.source.powerbi.config import (
     Constant,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
+    SupportedDataPlatform,
 )
 from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
     AbstractDataPlatformInstanceResolver,
     create_dataplatform_instance_resolver,
 )
-from datahub.ingestion.source.powerbi.m_query import parser, resolver
+from datahub.ingestion.source.powerbi.m_query import parser
 from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -182,7 +184,9 @@ class Mapper:
         return [schema_mcp]
 
     def make_fine_grained_lineage_class(
-        self, lineage: resolver.Lineage, dataset_urn: str
+        self,
+        lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+        dataset_urn: str,
     ) -> List[FineGrainedLineage]:
         fine_grained_lineages: List[FineGrainedLineage] = []
 
@@ -234,7 +238,9 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []
 
-        upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+        upstream_lineage: List[
+            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+        ] = parser.get_upstream_tables(
             table=table,
             reporter=self.__reporter,
             platform_instance_resolver=self.__dataplatform_instance_resolver,
@@ -1294,7 +1300,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def validate_dataset_type_mapping(self):
         powerbi_data_platforms: List[str] = [
             data_platform.value.powerbi_data_platform_name
-            for data_platform in resolver.SupportedDataPlatform
+            for data_platform in SupportedDataPlatform
         ]
 
         for key in self.source_config.dataset_type_mapping.keys():
@@ -1481,7 +1487,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
-        # This will result in creating checkpoint for each workspace_id
+        # This will result in creating a checkpoint for each workspace_id
         if self.source_config.modified_since:
             return []  # Handle these in get_workunits_internal
         else:
@@ -1492,7 +1498,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
-        Datahub Ingestion framework invoke this method
+        Datahub Ingestion framework invokes this method
         """
         logger.info("PowerBi plugin execution is started")
         # Validate dataset type mapping

datahub/ingestion/source/preset.py

@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
         super().__init__(ctx, config)
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
+        self.platform = "preset"
 
     def login(self):
         try:

datahub/ingestion/source/pulsar.py

@@ -78,8 +78,27 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")
 
-        avro_schema = json.loads(schema.get("data"))
-        self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
+        self.schema_name = "null"
+        if avro_schema.get("namespace") and avro_schema.get("name"):
+            self.schema_name = (
+                avro_schema.get("namespace") + "." + avro_schema.get("name")
+            )
+        elif avro_schema.get("namespace"):
+            self.schema_name = avro_schema.get("namespace")
+        elif avro_schema.get("name"):
+            self.schema_name = avro_schema.get("name")
+
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
         self.schema_str = schema.get("data")

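A standalone sketch of the new fallback behaviour when deriving schema_name from a Pulsar schema payload (not part of the diff; the payload and names below are made up for illustration):

    import json
    import logging

    logger = logging.getLogger(__name__)

    # Hypothetical schema payload with a name but no namespace and valid JSON data.
    schema = {"version": 1, "type": "AVRO", "data": '{"name": "User", "doc": "user records"}'}

    schema_data = schema.get("data") or "{}"
    try:
        avro_schema = json.loads(schema_data)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON schema: {schema_data}. Error: {e}")
        avro_schema = {}

    # Mirror of the new fallback chain: namespace.name, then namespace, then name, else "null".
    schema_name = "null"
    if avro_schema.get("namespace") and avro_schema.get("name"):
        schema_name = avro_schema["namespace"] + "." + avro_schema["name"]
    elif avro_schema.get("namespace"):
        schema_name = avro_schema["namespace"]
    elif avro_schema.get("name"):
        schema_name = avro_schema["name"]

    print(schema_name)  # -> "User"
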
datahub/ingestion/source/qlik_sense/data_classes.py

@@ -15,6 +15,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 
+# TODO: Replace with standardized types in sql_types.py
 FIELD_TYPE_MAPPING: Dict[
     str,
     Type[

datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set, Type
+from typing import Dict, Iterable, List, Optional, Set
 
 import dateutil.parser as dp
 from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
     parse_table_names_from_sql: bool = Field(
         default=False, description="See note below."
     )
-    sql_parser: str = Field(
-        default="datahub.utilities.sql_parser.DefaultSQLParser",
-        description="custom SQL parser. See note below for details.",
-    )
 
     env: str = Field(
         default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
         self.api_page_limit = self.config.api_page_limit or math.inf
 
         self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-        self.sql_parser_path = self.config.sql_parser
 
         logger.info(
             f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
         config = RedashConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
-    @classmethod
-    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-        assert "." in sql_parser_path, "sql_parser-path must contain a ."
-        parser_cls = import_path(sql_parser_path)
-
-        if not issubclass(parser_cls, SQLParser):
-            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-        return parser_cls
-
-    @classmethod
-    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-        try:
-            sql_table_names: List[str] = parser_cls(sql).get_tables()
-        except Exception as e:
-            logger.warning(f"Sql parser failed on {sql} with {e}")
-            return []
-
-        # Remove quotes from table names
-        sql_table_names = [t.replace('"', "") for t in sql_table_names]
-        sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-        return sql_table_names
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):
 
         return database_name
 
-    def _construct_datalineage_urn(
-        self, platform: str, database_name: str, sql_table_name: str
-    ) -> str:
-        full_dataset_name = get_full_qualified_name(
-            platform, database_name, sql_table_name
-        )
-        return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
     def _get_datasource_urns(
         self, data_source: Dict, sql_query_data: Dict = {}
     ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
         # Getting table lineage from SQL parsing
         if self.parse_table_names_from_sql and data_source_syntax == "sql":
             dataset_urns = list()
-            try:
-                sql_table_names = self._get_sql_table_names(
-                    query, self.sql_parser_path
-                )
-            except Exception as e:
+            sql_parser_in_tables = create_lineage_sql_parsed_result(
+                query=query,
+                platform=platform,
+                env=self.config.env,
+                platform_instance=None,
+                default_db=database_name,
+            )
+            # make sure dataset_urns is not empty list
+            dataset_urns = sql_parser_in_tables.in_tables
+            if sql_parser_in_tables.debug_info.table_error:
                 self.report.queries_problem_parsing.add(str(query_id))
                 self.error(
                     logger,
                     "sql-parsing",
-                    f"exception {e} in parsing query-{query_id}-datasource-{data_source_id}",
+                    f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
                 )
-                sql_table_names = []
-            for sql_table_name in sql_table_names:
-                try:
-                    dataset_urns.append(
-                        self._construct_datalineage_urn(
-                            platform, database_name, sql_table_name
-                        )
-                    )
-                except Exception:
-                    self.report.queries_problem_parsing.add(str(query_id))
-                    self.warn(
-                        logger,
-                        "data-urn-invalid",
-                        f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-                    )
 
-            # make sure dataset_urns is not empty list
             return dataset_urns if len(dataset_urns) > 0 else None
 
         else:

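The Redash source now delegates table extraction to the sqlglot-based parser rather than the removed pluggable sql_parser config. A minimal sketch of the call it now makes, using the keyword arguments and result fields shown in the diff; the query, platform, and database values are placeholders:

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="SELECT id, amount FROM sales.orders",  # placeholder query
        platform="postgres",                          # placeholder platform
        env="PROD",
        platform_instance=None,
        default_db="analytics",                       # placeholder database
    )
    if result.debug_info.table_error:
        print(f"parse failed: {result.debug_info.table_error}")
    else:
        print(result.in_tables)  # upstream dataset URNs derived from the query
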
datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
         description="Whether to extract column level lineage. This config works with rest-sink only.",
     )
 
+    # TODO - use DatasetPropertiesConfigMixin instead
     patch_custom_properties: bool = Field(
         default=True,
         description="Whether to patch custom properties on existing datasets rather than replace.",

datahub/ingestion/source/redshift/redshift.py

@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     ```
     """
 
+    # TODO: Replace with standardized types in sql_types.py
     REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
         str,
         Type[
@@ -830,6 +831,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             customProperties=custom_properties,
         )
         if self.config.patch_custom_properties:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Deprecate use of patch_custom_properties
             patch_builder = create_dataset_props_patch_builder(
                 dataset_urn, dataset_properties
             )

datahub/ingestion/source/s3/source.py

@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
         folders = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
-            prefix_to_process = dir.rstrip("\\").lstrip(
-                self.create_s3_path(bucket_name, "/")
-            )
+            prefix_to_process = urlparse(dir).path.lstrip("/")
 
             folders.extend(
                 self.get_folder_info(

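The prefix computation is the interesting change here: str.lstrip strips a set of characters rather than a literal prefix, so the old create_s3_path-based call could mangle folder names. urlparse keeps only the key portion of the URI. A quick illustration with a placeholder bucket name:

    from urllib.parse import urlparse

    # The path component of an S3 URI, minus the leading slash, is the object key prefix.
    print(urlparse("s3://my-bucket/logs/2024/").path.lstrip("/"))  # -> "logs/2024/"
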
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set, cast
+from typing import Dict, List, Optional, Set
 
 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+    IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
@@ -118,9 +121,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )
 
         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern is not None and schema_pattern:
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-            cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
         return values
 
@@ -187,6 +191,7 @@ class SnowflakeV2Config(
     StatefulUsageConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
+    IncrementalPropertiesConfigMixin,
 ):
     include_usage_stats: bool = Field(
         default=True,

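The effect of the always-applied INFORMATION_SCHEMA deny rule can be sketched with AllowDenyPattern directly; the database and schema names below are placeholders, and this snippet is illustrative rather than part of the source above:

    from datahub.configuration.common import AllowDenyPattern

    # The validator appends this deny regex to schema_pattern so that
    # INFORMATION_SCHEMA is always skipped during ingestion.
    pattern = AllowDenyPattern(deny=[r".*INFORMATION_SCHEMA$"])
    print(pattern.allowed("MY_DB.INFORMATION_SCHEMA"))  # -> False
    print(pattern.allowed("MY_DB.PUBLIC"))              # -> True
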
datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
            return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":

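Under the hood, the new OAUTH_AUTHENTICATOR_TOKEN branch simply hands an externally generated token to the Snowflake connector. A rough sketch of the equivalent direct call, with placeholder credentials (not a working configuration):

    import snowflake.connector

    # Equivalent of what the new branch does: authenticator="oauth" plus a
    # pre-generated token; all values below are placeholders.
    conn = snowflake.connector.connect(
        user="INGESTION_USER",
        account="my_account",
        authenticator="oauth",
        token="<oauth-token-from-external-idp>",
        warehouse="COMPUTE_WH",
        role="DATAHUB_ROLE",
    )
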
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-                context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,10 +129,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -149,10 +151,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+        SELECT
+          TABLE_CATALOG as "TABLE_CATALOG",
+          TABLE_SCHEMA as "TABLE_SCHEMA",
+          TABLE_NAME as "TABLE_NAME",
+          VIEW_DEFINITION as "VIEW_DEFINITION"
+        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
             snowflake_schemas.append(snowflake_schema)
         return snowflake_schemas
 
+    @serialized_lru_cache(maxsize=1)
+    def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict())
+        )
+        cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+        for view in cur:
+            db_name = view["TABLE_CATALOG"]
+            schema_name = view["TABLE_SCHEMA"]
+            view_name = view["TABLE_NAME"]
+            secure_view_definitions[db_name][schema_name][view_name] = view[
+                "VIEW_DEFINITION"
+            ]
+
+        return secure_view_definitions
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -313,6 +338,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
            )
         return tables
@@ -356,6 +383,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )

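A small standalone sketch of how the new information_schema columns map onto the table flags; the row dict below stands in for a query result, and the HYBRID TABLE check mirrors the new is_hybrid property:

    # Made-up result row from the updated information_schema.tables query.
    row = {"TABLE_TYPE": "HYBRID TABLE", "IS_DYNAMIC": "NO", "IS_ICEBERG": "YES"}

    is_dynamic = row.get("IS_DYNAMIC", "NO").upper() == "YES"   # False
    is_iceberg = row.get("IS_ICEBERG", "NO").upper() == "YES"   # True
    is_hybrid = row.get("TABLE_TYPE") == "HYBRID TABLE"         # True, mirrors SnowflakeTable.is_hybrid
    print(is_dynamic, is_iceberg, is_hybrid)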