acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -1,6 +1,6 @@
  import logging
  from functools import partial
- from typing import Any, Dict, List, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Union

  from lark import Token, Tree

@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
  if isinstance(node, Token):
  return None

- for child in cast(Tree, node).children:
+ for child in node.children:
  child_node: Optional[Tree] = internal(child)
  if child_node is not None:
  return child_node
@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
  logger.debug(f"Unable to resolve parameter reference to {ref}")
  values.append(ref)
  elif isinstance(node, Token):
- values.append(cast(Token, node).value)
+ values.append(node.value)
  return
  else:
  for child in node.children:

datahub/ingestion/source/powerbi/m_query/validator.py

@@ -1,7 +1,7 @@
  import logging
  from typing import Optional, Tuple

- from datahub.ingestion.source.powerbi.m_query import resolver
+ import datahub.ingestion.source.powerbi.m_query.data_classes

  logger = logging.getLogger(__name__)

@@ -14,12 +14,18 @@ def validate_parse_tree(
  :param native_query_enabled: Whether user want to extract lineage from native query
  :return: True or False.
  """
- function_names = [fun.value for fun in resolver.FunctionName]
+ function_names = [
+ fun.value
+ for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+ ]
  if not any(fun in expression for fun in function_names):
  return False, "DataAccess function is not present in M-Query expression."

  if native_query_enabled is False:
- if resolver.FunctionName.NATIVE_QUERY.value in function_names:
+ if (
+ datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+ in function_names
+ ):
  return (
  False,
  "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",

datahub/ingestion/source/powerbi/powerbi.py

@@ -10,6 +10,7 @@ from typing import Iterable, List, Optional, Tuple, Union
  import more_itertools

  import datahub.emitter.mce_builder as builder
+ import datahub.ingestion.source.powerbi.m_query.data_classes
  import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.mcp_builder import ContainerKey, gen_containers
@@ -42,12 +43,13 @@ from datahub.ingestion.source.powerbi.config import (
  Constant,
  PowerBiDashboardSourceConfig,
  PowerBiDashboardSourceReport,
+ SupportedDataPlatform,
  )
  from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
  AbstractDataPlatformInstanceResolver,
  create_dataplatform_instance_resolver,
  )
- from datahub.ingestion.source.powerbi.m_query import parser, resolver
+ from datahub.ingestion.source.powerbi.m_query import parser
  from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
  StaleEntityRemovalHandler,
@@ -182,7 +184,9 @@ class Mapper:
  return [schema_mcp]

  def make_fine_grained_lineage_class(
- self, lineage: resolver.Lineage, dataset_urn: str
+ self,
+ lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+ dataset_urn: str,
  ) -> List[FineGrainedLineage]:
  fine_grained_lineages: List[FineGrainedLineage] = []

@@ -234,7 +238,9 @@ class Mapper:
  upstream: List[UpstreamClass] = []
  cll_lineage: List[FineGrainedLineage] = []

- upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+ upstream_lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = parser.get_upstream_tables(
  table=table,
  reporter=self.__reporter,
  platform_instance_resolver=self.__dataplatform_instance_resolver,
@@ -1294,7 +1300,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
  def validate_dataset_type_mapping(self):
  powerbi_data_platforms: List[str] = [
  data_platform.value.powerbi_data_platform_name
- for data_platform in resolver.SupportedDataPlatform
+ for data_platform in SupportedDataPlatform
  ]

  for key in self.source_config.dataset_type_mapping.keys():
@@ -1481,7 +1487,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
  # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
- # This will result in creating checkpoint for each workspace_id
+ # This will result in creating a checkpoint for each workspace_id
  if self.source_config.modified_since:
  return [] # Handle these in get_workunits_internal
  else:
@@ -1492,7 +1498,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  """
- Datahub Ingestion framework invoke this method
+ Datahub Ingestion framework invokes this method
  """
  logger.info("PowerBi plugin execution is started")
  # Validate dataset type mapping

datahub/ingestion/source/preset.py

@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
  super().__init__(ctx, config)
  self.config = config
  self.report = StaleEntityRemovalSourceReport()
+ self.platform = "preset"

  def login(self):
  try:

datahub/ingestion/source/pulsar.py

@@ -78,8 +78,27 @@ class PulsarSchema:
  def __init__(self, schema):
  self.schema_version = schema.get("version")

- avro_schema = json.loads(schema.get("data"))
- self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
+ schema_data = schema.get("data")
+ if not schema_data:
+ logger.warning("Schema data is empty or None. Using default empty schema.")
+ schema_data = "{}"
+
+ try:
+ avro_schema = json.loads(schema_data)
+ except json.JSONDecodeError as e:
+ logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+ avro_schema = {}
+
+ self.schema_name = "null"
+ if avro_schema.get("namespace") and avro_schema.get("name"):
+ self.schema_name = (
+ avro_schema.get("namespace") + "." + avro_schema.get("name")
+ )
+ elif avro_schema.get("namespace"):
+ self.schema_name = avro_schema.get("namespace")
+ elif avro_schema.get("name"):
+ self.schema_name = avro_schema.get("name")
+
  self.schema_description = avro_schema.get("doc")
  self.schema_type = schema.get("type")
  self.schema_str = schema.get("data")

datahub/ingestion/source/qlik_sense/data_classes.py

@@ -15,6 +15,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
  TimeType,
  )

+ # TODO: Replace with standardized types in sql_types.py
  FIELD_TYPE_MAPPING: Dict[
  str,
  Type[

datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
  import math
  import sys
  from dataclasses import dataclass, field
- from typing import Dict, Iterable, List, Optional, Set, Type
+ from typing import Dict, Iterable, List, Optional, Set

  import dateutil.parser as dp
  from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
  platform_name,
  support_status,
  )
- from datahub.ingestion.api.registry import import_path
  from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
  ChartTypeClass,
  DashboardInfoClass,
  )
+ from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
  from datahub.utilities.lossy_collections import LossyDict, LossyList
  from datahub.utilities.perf_timer import PerfTimer
- from datahub.utilities.sql_parser_base import SQLParser
  from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

  logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
  parse_table_names_from_sql: bool = Field(
  default=False, description="See note below."
  )
- sql_parser: str = Field(
- default="datahub.utilities.sql_parser.DefaultSQLParser",
- description="custom SQL parser. See note below for details.",
- )

  env: str = Field(
  default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
  self.api_page_limit = self.config.api_page_limit or math.inf

  self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
- self.sql_parser_path = self.config.sql_parser

  logger.info(
  f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
  config = RedashConfig.parse_obj(config_dict)
  return cls(ctx, config)

- @classmethod
- def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
- assert "." in sql_parser_path, "sql_parser-path must contain a ."
- parser_cls = import_path(sql_parser_path)
-
- if not issubclass(parser_cls, SQLParser):
- raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
- return parser_cls
-
- @classmethod
- def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
- parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
- try:
- sql_table_names: List[str] = parser_cls(sql).get_tables()
- except Exception as e:
- logger.warning(f"Sql parser failed on {sql} with {e}")
- return []
-
- # Remove quotes from table names
- sql_table_names = [t.replace('"', "") for t in sql_table_names]
- sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
- return sql_table_names
-
  def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
  url = f"/api/data_sources/{data_source_id}"
  resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):

  return database_name

- def _construct_datalineage_urn(
- self, platform: str, database_name: str, sql_table_name: str
- ) -> str:
- full_dataset_name = get_full_qualified_name(
- platform, database_name, sql_table_name
- )
- return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
  def _get_datasource_urns(
  self, data_source: Dict, sql_query_data: Dict = {}
  ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
  # Getting table lineage from SQL parsing
  if self.parse_table_names_from_sql and data_source_syntax == "sql":
  dataset_urns = list()
- try:
- sql_table_names = self._get_sql_table_names(
- query, self.sql_parser_path
- )
- except Exception as e:
+ sql_parser_in_tables = create_lineage_sql_parsed_result(
+ query=query,
+ platform=platform,
+ env=self.config.env,
+ platform_instance=None,
+ default_db=database_name,
+ )
+ # make sure dataset_urns is not empty list
+ dataset_urns = sql_parser_in_tables.in_tables
+ if sql_parser_in_tables.debug_info.table_error:
  self.report.queries_problem_parsing.add(str(query_id))
  self.error(
  logger,
  "sql-parsing",
- f"exception {e} in parsing query-{query_id}-datasource-{data_source_id}",
+ f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
  )
- sql_table_names = []
- for sql_table_name in sql_table_names:
- try:
- dataset_urns.append(
- self._construct_datalineage_urn(
- platform, database_name, sql_table_name
- )
- )
- except Exception:
- self.report.queries_problem_parsing.add(str(query_id))
- self.warn(
- logger,
- "data-urn-invalid",
- f"Problem making URN for {sql_table_name} parsed from query {query_id}",
- )

- # make sure dataset_urns is not empty list
  return dataset_urns if len(dataset_urns) > 0 else None

  else:
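
Note: the Redash change above drops the pluggable sql_parser config (together with the removed SQL parser utilities, items 135-137 in the file list) in favor of the sqlglot-based parser. Below is a minimal sketch of calling the new helper directly, using only the keyword arguments and result fields visible in this diff; the query, platform, and database names are hypothetical.

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    # Hypothetical query and connection details, for illustration only.
    result = create_lineage_sql_parsed_result(
        query="SELECT id, amount FROM sales.orders",
        platform="postgres",
        platform_instance=None,
        env="PROD",
        default_db="analytics",
    )
    if result.debug_info.table_error:
        print(f"SQL parsing failed: {result.debug_info.table_error}")
    else:
        print(result.in_tables)  # upstream dataset URNs parsed from the query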

datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
  description="Whether to extract column level lineage. This config works with rest-sink only.",
  )

+ # TODO - use DatasetPropertiesConfigMixin instead
  patch_custom_properties: bool = Field(
  default=True,
  description="Whether to patch custom properties on existing datasets rather than replace.",

datahub/ingestion/source/redshift/redshift.py

@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
  ```
  """

+ # TODO: Replace with standardized types in sql_types.py
  REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
  str,
  Type[
@@ -830,6 +831,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
  customProperties=custom_properties,
  )
  if self.config.patch_custom_properties:
+ # TODO: use auto_incremental_properties workunit processor instead
+ # Deprecate use of patch_custom_properties
  patch_builder = create_dataset_props_patch_builder(
  dataset_urn, dataset_properties
  )

datahub/ingestion/source/s3/source.py

@@ -9,6 +9,7 @@ from datetime import datetime
  from itertools import groupby
  from pathlib import PurePath
  from typing import Any, Dict, Iterable, List, Optional, Tuple
+ from urllib.parse import urlparse

  import smart_open.compression as so_compression
  from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
  folders = []
  for dir in dirs_to_process:
  logger.info(f"Getting files from folder: {dir}")
- prefix_to_process = dir.rstrip("\\").lstrip(
- self.create_s3_path(bucket_name, "/")
- )
+ prefix_to_process = urlparse(dir).path.lstrip("/")

  folders.extend(
  self.get_folder_info(
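
Note: the S3 fix above replaces a character-based str.lstrip, which strips any of the given characters rather than a literal prefix, with urlparse, which cleanly separates the bucket (netloc) from the object key (path). A small illustration with a hypothetical S3 folder path:

    from urllib.parse import urlparse

    # Hypothetical S3 folder path, for illustration only.
    folder = "s3://my-bucket/data/events/2024/"

    parsed = urlparse(folder)
    print(parsed.netloc)            # "my-bucket" (the bucket)
    print(parsed.path.lstrip("/"))  # "data/events/2024/" (the key prefix to process)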

datahub/ingestion/source/sigma/data_classes.py

@@ -80,6 +80,7 @@ class Workbook(BaseModel):
  path: str
  latestVersion: int
  workspaceId: Optional[str] = None
+ description: Optional[str] = None
  pages: List[Page] = []
  badge: Optional[str] = None


datahub/ingestion/source/sigma/sigma.py

@@ -4,7 +4,12 @@ from typing import Dict, Iterable, List, Optional
  import datahub.emitter.mce_builder as builder
  from datahub.configuration.common import ConfigurationError
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.emitter.mcp_builder import add_entity_to_container, gen_containers
+ from datahub.emitter.mcp_builder import (
+ add_entity_to_container,
+ add_owner_to_entity_wu,
+ add_tags_to_entity_wu,
+ gen_containers,
+ )
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
  SourceCapability,
@@ -59,12 +64,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
  UpstreamLineage,
  )
  from datahub.metadata.schema_classes import (
+ AuditStampClass,
  BrowsePathEntryClass,
  BrowsePathsV2Class,
  ChangeAuditStampsClass,
  ChartInfoClass,
  DashboardInfoClass,
  DataPlatformInstanceClass,
+ EdgeClass,
  GlobalTagsClass,
  InputFieldClass,
  InputFieldsClass,
@@ -74,6 +81,7 @@ from datahub.metadata.schema_classes import (
  SchemaFieldClass,
  SchemaFieldDataTypeClass,
  StringTypeClass,
+ SubTypesClass,
  TagAssociationClass,
  )
  from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
@@ -257,11 +265,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  entries = [
  BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn)
  ] + [BrowsePathEntryClass(id=path) for path in paths]
- if self.config.platform_instance:
- urn = builder.make_dataplatform_instance_urn(
- self.platform, self.config.platform_instance
- )
- entries = [BrowsePathEntryClass(id=urn, urn=urn)] + entries
  return MetadataChangeProposalWrapper(
  entityUrn=entity_urn,
  aspect=BrowsePathsV2Class(entries),
@@ -424,11 +427,11 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  elements: List[Element],
  workbook: Workbook,
  all_input_fields: List[InputFieldClass],
+ paths: List[str],
  ) -> Iterable[MetadataWorkUnit]:
  """
  Map Sigma page element to Datahub Chart
  """
-
  for element in elements:
  chart_urn = builder.make_chart_urn(
  platform=self.platform,
@@ -459,11 +462,14 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  ),
  ).as_workunit()

- yield from add_entity_to_container(
- container_key=self._gen_workbook_key(workbook.workbookId),
- entity_type="chart",
- entity_urn=chart_urn,
- )
+ if workbook.workspaceId:
+ yield self._gen_entity_browsepath_aspect(
+ entity_urn=chart_urn,
+ parent_entity_urn=builder.make_container_urn(
+ self._gen_workspace_key(workbook.workspaceId)
+ ),
+ paths=paths + [workbook.name],
+ )

  # Add sigma dataset's upstream dataset urn mapping
  for dataset_urn, upstream_dataset_urns in inputs.items():
@@ -494,7 +500,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

  all_input_fields.extend(element_input_fields)

- def _gen_pages_workunit(self, workbook: Workbook) -> Iterable[MetadataWorkUnit]:
+ def _gen_pages_workunit(
+ self, workbook: Workbook, paths: List[str]
+ ) -> Iterable[MetadataWorkUnit]:
  """
  Map Sigma workbook page to Datahub dashboard
  """
@@ -505,20 +513,23 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

  yield self._gen_dashboard_info_workunit(page)

- yield from add_entity_to_container(
- container_key=self._gen_workbook_key(workbook.workbookId),
- entity_type="dashboard",
- entity_urn=dashboard_urn,
- )
-
  dpi_aspect = self._gen_dataplatform_instance_aspect(dashboard_urn)
  if dpi_aspect:
  yield dpi_aspect

  all_input_fields: List[InputFieldClass] = []

+ if workbook.workspaceId:
+ yield self._gen_entity_browsepath_aspect(
+ entity_urn=dashboard_urn,
+ parent_entity_urn=builder.make_container_urn(
+ self._gen_workspace_key(workbook.workspaceId)
+ ),
+ paths=paths + [workbook.name],
+ )
+
  yield from self._gen_elements_workunit(
- page.elements, workbook, all_input_fields
+ page.elements, workbook, all_input_fields, paths
  )

  yield MetadataChangeProposalWrapper(
@@ -531,42 +542,89 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  Map Sigma Workbook to Datahub container
  """
  owner_username = self.sigma_api.get_user_name(workbook.createdBy)
- workbook_key = self._gen_workbook_key(workbook.workbookId)
- yield from gen_containers(
- container_key=workbook_key,
- name=workbook.name,
- sub_types=[BIContainerSubTypes.SIGMA_WORKBOOK],
- parent_container_key=(
- self._gen_workspace_key(workbook.workspaceId)
- if workbook.workspaceId
- else None
+
+ dashboard_urn = self._gen_dashboard_urn(workbook.workbookId)
+
+ yield self._gen_entity_status_aspect(dashboard_urn)
+
+ lastModified = AuditStampClass(
+ time=int(workbook.updatedAt.timestamp() * 1000),
+ actor="urn:li:corpuser:datahub",
+ )
+ created = AuditStampClass(
+ time=int(workbook.createdAt.timestamp() * 1000),
+ actor="urn:li:corpuser:datahub",
+ )
+
+ dashboard_info_cls = DashboardInfoClass(
+ title=workbook.name,
+ description=workbook.description if workbook.description else "",
+ dashboards=[
+ EdgeClass(
+ destinationUrn=self._gen_dashboard_urn(page.get_urn_part()),
+ sourceUrn=dashboard_urn,
+ )
+ for page in workbook.pages
+ ],
+ externalUrl=workbook.url,
+ lastModified=ChangeAuditStampsClass(
+ created=created, lastModified=lastModified
  ),
- extra_properties={
+ customProperties={
  "path": workbook.path,
  "latestVersion": str(workbook.latestVersion),
  },
- owner_urn=(
- builder.make_user_urn(owner_username)
- if self.config.ingest_owner and owner_username
- else None
- ),
- external_url=workbook.url,
- tags=[workbook.badge] if workbook.badge else None,
- created=int(workbook.createdAt.timestamp() * 1000),
- last_modified=int(workbook.updatedAt.timestamp() * 1000),
  )
+ yield MetadataChangeProposalWrapper(
+ entityUrn=dashboard_urn, aspect=dashboard_info_cls
+ ).as_workunit()
+
+ # Set subtype
+ yield MetadataChangeProposalWrapper(
+ entityUrn=dashboard_urn,
+ aspect=SubTypesClass(typeNames=[BIContainerSubTypes.SIGMA_WORKBOOK]),
+ ).as_workunit()
+
+ # Ownership
+ owner_urn = (
+ builder.make_user_urn(owner_username)
+ if self.config.ingest_owner and owner_username
+ else None
+ )
+ if owner_urn:
+ yield from add_owner_to_entity_wu(
+ entity_type="dashboard",
+ entity_urn=dashboard_urn,
+ owner_urn=owner_urn,
+ )
+
+ # Tags
+ tags = [workbook.badge] if workbook.badge else None
+ if tags:
+ yield from add_tags_to_entity_wu(
+ entity_type="dashboard",
+ entity_urn=dashboard_urn,
+ tags=sorted(tags),
+ )

  paths = workbook.path.split("/")[1:]
- if len(paths) > 0 and workbook.workspaceId:
+ if workbook.workspaceId:
  yield self._gen_entity_browsepath_aspect(
- entity_urn=builder.make_container_urn(workbook_key),
+ entity_urn=dashboard_urn,
  parent_entity_urn=builder.make_container_urn(
  self._gen_workspace_key(workbook.workspaceId)
  ),
- paths=paths,
+ paths=paths + [workbook.name],
  )

- yield from self._gen_pages_workunit(workbook)
+ if len(paths) == 0:
+ yield from add_entity_to_container(
+ container_key=self._gen_workspace_key(workbook.workspaceId),
+ entity_type="dashboard",
+ entity_urn=dashboard_urn,
+ )
+
+ yield from self._gen_pages_workunit(workbook, paths)

  def _gen_sigma_dataset_upstream_lineage_workunit(
  self,

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
  import logging
  from collections import defaultdict
  from dataclasses import dataclass
- from typing import Dict, List, Optional, Set, cast
+ from typing import Dict, List, Optional, Set

  import pydantic
  from pydantic import Field, SecretStr, root_validator, validator
@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
  from datahub.configuration.validate_field_removal import pydantic_removed_field
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
+ from datahub.ingestion.api.incremental_properties_helper import (
+ IncrementalPropertiesConfigMixin,
+ )
  from datahub.ingestion.glossary.classification_mixin import (
  ClassificationSourceConfigMixin,
  )
@@ -118,9 +121,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
  )

  # Always exclude reporting metadata for INFORMATION_SCHEMA schema
- if schema_pattern is not None and schema_pattern:
+ if schema_pattern:
  logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
- cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+ assert isinstance(schema_pattern, AllowDenyPattern)
+ schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")

  return values

@@ -187,6 +191,7 @@ class SnowflakeV2Config(
  StatefulUsageConfigMixin,
  StatefulProfilingConfigMixin,
  ClassificationSourceConfigMixin,
+ IncrementalPropertiesConfigMixin,
  ):
  include_usage_stats: bool = Field(
  default=True,