acryl-datahub 1.2.0.11rc4__py3-none-any.whl → 1.2.0.11rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (42)
  1. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/METADATA +2622 -2619
  2. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/RECORD +42 -40
  3. datahub/_version.py +1 -1
  4. datahub/cli/docker_check.py +1 -1
  5. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +296 -0
  6. datahub/ingestion/api/source.py +29 -5
  7. datahub/ingestion/api/source_protocols.py +23 -0
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  9. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
  10. datahub/ingestion/source/cassandra/cassandra_profiling.py +2 -2
  11. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  12. datahub/ingestion/source/dremio/dremio_reporting.py +0 -2
  13. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  14. datahub/ingestion/source/fivetran/config.py +32 -5
  15. datahub/ingestion/source/fivetran/fivetran.py +0 -1
  16. datahub/ingestion/source/fivetran/fivetran_log_api.py +13 -0
  17. datahub/ingestion/source/fivetran/fivetran_query.py +43 -28
  18. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  19. datahub/ingestion/source/grafana/models.py +9 -1
  20. datahub/ingestion/source/grafana/report.py +1 -2
  21. datahub/ingestion/source/hex/hex.py +0 -2
  22. datahub/ingestion/source/redshift/redshift.py +2 -2
  23. datahub/ingestion/source/redshift/report.py +0 -2
  24. datahub/ingestion/source/snowflake/snowflake_report.py +0 -2
  25. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +2 -2
  26. datahub/ingestion/source/sql/oracle.py +1 -1
  27. datahub/ingestion/source/sql/sql_common.py +25 -17
  28. datahub/ingestion/source/sql/teradata.py +1 -2
  29. datahub/ingestion/source/sql_queries.py +1 -2
  30. datahub/ingestion/source/tableau/tableau.py +0 -2
  31. datahub/ingestion/source/unity/config.py +49 -29
  32. datahub/ingestion/source/unity/report.py +1 -2
  33. datahub/ingestion/source_report/ingestion_stage.py +54 -12
  34. datahub/metadata/_internal_schema_classes.py +169 -0
  35. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  36. datahub/metadata/schema.avsc +101 -0
  37. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  38. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  39. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/WHEEL +0 -0
  40. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/entry_points.txt +0 -0
  41. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/licenses/LICENSE +0 -0
  42. {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.2.0.11rc5.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,6 @@ from typing_extensions import LiteralString, Self
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -46,8 +45,15 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit,
     auto_workunit_reporter,
 )
+from datahub.ingestion.api.source_protocols import (
+    MetadataWorkUnitIterable,
+    ProfilingCapable,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.sdk.entity import Entity
+from datahub.ingestion.source_report.ingestion_stage import (
+    IngestionHighStage,
+    IngestionStageReport,
+)
 from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
@@ -205,7 +211,7 @@ class StructuredLogs(Report):
 
 
 @dataclass
-class SourceReport(ExamplesReport):
+class SourceReport(ExamplesReport, IngestionStageReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
@@ -553,13 +559,31 @@ class Source(Closeable, metaclass=ABCMeta):
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         workunit_processors = self.get_workunit_processors()
         workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
-        return self._apply_workunit_processors(
+        # Process main workunits
+        yield from self._apply_workunit_processors(
             workunit_processors, auto_workunit(self.get_workunits_internal())
         )
+        # Process profiling workunits
+        yield from self._process_profiling_stage(workunit_processors)
+
+    def _process_profiling_stage(
+        self, processors: List[Optional[MetadataWorkUnitProcessor]]
+    ) -> Iterable[MetadataWorkUnit]:
+        """Process profiling stage if source supports it."""
+        if (
+            not isinstance(self, ProfilingCapable)
+            or not self.is_profiling_enabled_internal()
+        ):
+            return
+        with self.get_report().new_high_stage(IngestionHighStage.PROFILING):
+            profiling_stream = self._apply_workunit_processors(
+                processors, auto_workunit(self.get_profiling_internal())
+            )
+            yield from profiling_stream
 
     def get_workunits_internal(
         self,
-    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
+    ) -> MetadataWorkUnitIterable:
        raise NotImplementedError(
            "get_workunits_internal must be implemented if get_workunits is not overriden."
        )
@@ -0,0 +1,23 @@
+from typing import Iterable, Protocol, Union, runtime_checkable
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.sdk.entity import Entity
+
+# Type alias for metadata work units - Python 3.9 compatible
+MetadataWorkUnitIterable = Iterable[
+    Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]
+]
+
+
+@runtime_checkable
+class ProfilingCapable(Protocol):
+    """Protocol for sources that support profiling functionality."""
+
+    def is_profiling_enabled_internal(self) -> bool:
+        """Check if profiling is enabled for this source."""
+        ...
+
+    def get_profiling_internal(self) -> MetadataWorkUnitIterable:
+        """Generate profiling work units."""
+        ...
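The new source_protocols module lets the base Source class detect profiling support structurally. A minimal illustrative sketch (the class name below is hypothetical, not part of the package) of a source opting in to the new profiling hook:

from datahub.ingestion.api.source_protocols import (
    MetadataWorkUnitIterable,
    ProfilingCapable,
)


class MyProfilingSource:  # hypothetical source, for illustration only
    def is_profiling_enabled_internal(self) -> bool:
        # typically delegates to the source config, e.g. self.config.is_profiling_enabled()
        return True

    def get_profiling_internal(self) -> MetadataWorkUnitIterable:
        # yield MetadataWorkUnit / MetadataChangeProposalWrapper / Entity objects here
        return iter([])


# Because ProfilingCapable is @runtime_checkable, Source.get_workunits can detect
# support with a plain isinstance check before entering the PROFILING stage.
assert isinstance(MyProfilingSource(), ProfilingCapable)

Sources that expose these two methods get their profiling work units emitted under the PROFILING high stage by Source.get_workunits without overriding it.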
@@ -9,7 +9,6 @@ import pydantic
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
 @dataclass
 class BigQueryV2Report(
     SQLSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
     ClassificationReportMixin,
 ):
@@ -66,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     METADATA_EXTRACTION,
-    PROFILING,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     Status,
@@ -416,7 +416,7 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
            logger.info(f"Starting profiling project {project_id}")
-            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                yield from self.profiler.get_workunits(
                    project_id=project_id,
                    tables=db_tables,
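Since SourceReport now inherits IngestionStageReport, per-source report classes drop the explicit mixin (as the surrounding report hunks show), and stage tracking moves from free-form new_stage("<db>: Profiling") strings to the IngestionHighStage enum. A hedged sketch of the pattern, assuming a SourceReport subclass can be instantiated directly with its defaults:

from dataclasses import dataclass

from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage


@dataclass
class MyReport(SourceReport):  # hypothetical report; no IngestionStageReport base needed
    tables_profiled: int = 0


report = MyReport()
# Work done inside the block is attributed to the well-known PROFILING high stage.
with report.new_high_stage(IngestionHighStage.PROFILING):
    report.tables_profiled += 1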
@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
 )
 from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
 from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -71,7 +71,7 @@ class CassandraProfiler:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
             with (
-                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
                 ThreadPoolExecutor(
                     max_workers=self.config.profiling.max_workers
                 ) as executor,
@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(
 
 
 @dataclass
-class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
+class CassandraSourceReport(StaleEntityRemovalSourceReport):
     num_tables_failed: int = 0
     num_views_failed: int = 0
     tables_scanned: int = 0
@@ -6,7 +6,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.stats_collections import (
@@ -20,7 +19,6 @@ from datahub.utilities.stats_collections import (
 class DremioSourceReport(
     SQLSourceReport,
     StaleEntityRemovalSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
@@ -55,7 +55,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-    PROFILING,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
@@ -283,7 +283,7 @@ class DremioSource(StatefulIngestionSourceBase):
         # Profiling
         if self.config.is_profiling_enabled():
             with (
-                self.report.new_stage(PROFILING),
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
                 ThreadPoolExecutor(
                     max_workers=self.config.profiling.max_workers
                 ) as executor,
@@ -29,6 +29,9 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogConnectionConfig,
+)
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -56,8 +59,8 @@ class Constant:
     STATUS = "status"
     USER_ID = "user_id"
     EMAIL = "email"
-    CONNECTOR_ID = "connector_id"
-    CONNECTOR_NAME = "connector_name"
+    CONNECTOR_ID = "connection_id"
+    CONNECTOR_NAME = "connection_name"
     CONNECTOR_TYPE_ID = "connector_type_id"
     PAUSED = "paused"
     SYNC_FREQUENCY = "sync_frequency"
@@ -85,10 +88,23 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):
     dataset: str = Field(description="The fivetran connector log dataset.")
 
 
+class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
+    catalog: str = Field(description="The fivetran connector log catalog.")
+    log_schema: str = Field(description="The fivetran connector log schema.")
+
+    @pydantic.validator("warehouse_id")
+    def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
+        if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
+            raise ValueError("Fivetran requires warehouse_id to be set")
+        return warehouse_id
+
+
 class FivetranLogConfig(ConfigModel):
-    destination_platform: Literal["snowflake", "bigquery"] = pydantic.Field(
-        default="snowflake",
-        description="The destination platform where fivetran connector log tables are dumped.",
+    destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
+        pydantic.Field(
+            default="snowflake",
+            description="The destination platform where fivetran connector log tables are dumped.",
+        )
     )
     snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
         default=None,
@@ -98,6 +114,12 @@ class FivetranLogConfig(ConfigModel):
         default=None,
         description="If destination platform is 'bigquery', provide bigquery configuration.",
     )
+    databricks_destination_config: Optional[DatabricksDestinationConfig] = (
+        pydantic.Field(
+            default=None,
+            description="If destination platform is 'databricks', provide databricks configuration.",
+        )
+    )
     _rename_destination_config = pydantic_renamed_field(
         "destination_config", "snowflake_destination_config"
     )
@@ -115,6 +137,11 @@ class FivetranLogConfig(ConfigModel):
                 raise ValueError(
                     "If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
                 )
+        elif destination_platform == "databricks":
+            if "databricks_destination_config" not in values:
+                raise ValueError(
+                    "If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
+                )
         else:
             raise ValueError(
                 f"Destination platform '{destination_platform}' is not yet supported."
@@ -66,7 +66,6 @@ logger = logging.getLogger(__name__)
 class FivetranSource(StatefulIngestionSourceBase):
     """
     This plugin extracts fivetran users, connectors, destinations and sync history.
-    This plugin is in beta and has only been tested on Snowflake connector.
     """
 
     config: FivetranSourceConfig
@@ -73,6 +73,19 @@ class FivetranLogAPI:
             if result is None:
                 raise ValueError("Failed to retrieve BigQuery project ID")
             fivetran_log_database = result[0]
+        elif destination_platform == "databricks":
+            databricks_destination_config = (
+                self.fivetran_log_config.databricks_destination_config
+            )
+            if databricks_destination_config is not None:
+                engine = create_engine(
+                    databricks_destination_config.get_sql_alchemy_url(
+                        databricks_destination_config.catalog
+                    ),
+                    **databricks_destination_config.get_options(),
+                )
+                fivetran_log_query.set_schema(databricks_destination_config.log_schema)
+                fivetran_log_database = databricks_destination_config.catalog
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
@@ -6,6 +6,21 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000
 MAX_JOBS_PER_CONNECTOR = 500
 
 
+"""
+------------------------------------------------------------------------------------------------------------
+Fivetran Platform Connector Handling
+------------------------------------------------------------------------------------------------------------
+Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)
+
+All queries have to be updated as per Fivetran Platform Connector release if any. We expect customers
+and fivetran to keep platform connector configured for DataHub with auto sync enabled to get latest changes.
+
+References:
+- Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
+- Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
+"""
+
+
 class FivetranLogQuery:
     # Note: All queries are written in Snowflake SQL.
     # They will be transpiled to the target database's SQL dialect at runtime.
@@ -30,17 +45,17 @@ class FivetranLogQuery:
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
-    connector_id,
+    connection_id,
     connecting_user_id,
     connector_type_id,
-    connector_name,
+    connection_name,
     paused,
     sync_frequency,
     destination_id
-FROM {self.schema_clause}connector
+FROM {self.schema_clause}connection
 WHERE
     _fivetran_deleted = FALSE
-QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
 """
 
     def get_users_query(self) -> str:
@@ -63,20 +78,20 @@ FROM {self.schema_clause}user
         return f"""\
 WITH ranked_syncs AS (
     SELECT
-        connector_id,
+        connection_id,
         sync_id,
         MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
        MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
-        ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
+        ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
     FROM {self.schema_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
        AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
-        AND connector_id IN ({formatted_connector_ids})
-    GROUP BY connector_id, sync_id
+        AND connection_id IN ({formatted_connector_ids})
+    GROUP BY connection_id, sync_id
 )
 SELECT
-    connector_id,
+    connection_id,
     sync_id,
     start_time,
     end_time,
@@ -85,7 +100,7 @@ FROM ranked_syncs
 WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
     AND start_time IS NOT NULL
     AND end_time IS NOT NULL
-ORDER BY connector_id, end_time DESC
+ORDER BY connection_id, end_time DESC
 """
 
     def get_table_lineage_query(self, connector_ids: List[str]) -> str:
@@ -97,7 +112,7 @@ SELECT
     *
 FROM (
     SELECT
-        stm.connector_id as connector_id,
+        stm.connection_id as connection_id,
         stm.id as source_table_id,
         stm.name as source_table_name,
         ssm.name as source_schema_name,
@@ -105,18 +120,18 @@ FROM (
         dtm.name as destination_table_name,
         dsm.name as destination_schema_name,
         tl.created_at as created_at,
-        ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
+        ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
     FROM {self.schema_clause}table_lineage as tl
-    JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
-    JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
-    JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
-    JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
-    WHERE stm.connector_id IN ({formatted_connector_ids})
+    JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
+    JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
+    JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
+    JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
+    WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
 WHERE table_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
-ORDER BY connector_id, created_at DESC
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """
 
     def get_column_lineage_query(self, connector_ids: List[str]) -> str:
@@ -131,25 +146,25 @@ SELECT
     destination_column_name
 FROM (
     SELECT
-        stm.connector_id as connector_id,
+        stm.connection_id as connection_id,
         scm.table_id as source_table_id,
         dcm.table_id as destination_table_id,
         scm.name as source_column_name,
        dcm.name as destination_column_name,
        cl.created_at as created_at,
-        ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
+        ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
     FROM {self.schema_clause}column_lineage as cl
-    JOIN {self.schema_clause}source_column_metadata as scm
+    JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
        ON cl.source_column_id = scm.id
-    JOIN {self.schema_clause}destination_column_metadata as dcm
+    JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
        ON cl.destination_column_id = dcm.id
-    -- Only joining source_table_metadata to get the connector_id.
-    JOIN {self.schema_clause}source_table_metadata as stm
+    -- Only joining source_table to get the connection_id.
+    JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
        ON scm.table_id = stm.id
-    WHERE stm.connector_id IN ({formatted_connector_ids})
+    WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per (connector, source column, destination column) pair.
 WHERE column_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
-ORDER BY connector_id, created_at DESC
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """
@@ -34,7 +34,6 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -87,7 +86,6 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
-    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -8,12 +8,14 @@ References:
 - Dashboard JSON structure: https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/view-dashboard-json-model/
 """
 
+import logging
 from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, ConfigDict, Field
 
 from datahub.emitter.mcp_builder import ContainerKey
 
+logger = logging.getLogger(__name__)
 # Grafana-specific type definitions for better type safety
 GrafanaQueryTarget = Dict[
     str, Any
@@ -89,7 +91,13 @@ class Dashboard(_GrafanaBaseModel):
     def parse_obj(cls, data: Dict[str, Any]) -> "Dashboard":
         """Custom parsing to handle nested panel extraction."""
         dashboard_data = data.get("dashboard", {})
-        panels = cls.extract_panels(dashboard_data.get("panels", []))
+        _panel_data = dashboard_data.get("panels", [])
+        try:
+            panels = cls.extract_panels(_panel_data)
+        except Exception as e:
+            logger.warning(
+                f"Error extracting panels from dashboard for dashboard panels {_panel_data} : {e}"
+            )
 
         # Extract meta.folderId from nested structure
         meta = dashboard_data.get("meta", {})
@@ -3,11 +3,10 @@ from dataclasses import dataclass
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 
 @dataclass
-class GrafanaSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
+class GrafanaSourceReport(StaleEntityRemovalSourceReport):
     # Entity counters
     dashboards_scanned: int = 0
     charts_scanned: int = 0
@@ -46,7 +46,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.sdk.main_client import DataHubClient
 
 
@@ -172,7 +171,6 @@ class HexSourceConfig(
 class HexReport(
     StaleEntityRemovalSourceReport,
     HexApiReport,
-    IngestionStageReport,
     HexQueryFetcherReport,
 ):
     pass
@@ -89,8 +89,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-    PROFILING,
     USAGE_EXTRACTION_INGESTION,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes, TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
@@ -446,7 +446,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         )
 
         if self.config.is_profiling_enabled():
-            with self.report.new_stage(PROFILING):
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                profiler = RedshiftProfiler(
                    config=self.config,
                    report=self.report,
@@ -4,7 +4,6 @@ from typing import Dict, Optional
 
 from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict
@@ -15,7 +14,6 @@ from datahub.utilities.stats_collections import TopKDict
 @dataclass
 class RedshiftReport(
     SQLSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
     ClassificationReportMixin,
 ):
@@ -9,7 +9,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict
@@ -96,7 +95,6 @@ class SnowflakeV2Report(
     SnowflakeUsageReport,
     StatefulIngestionReport,
     ClassificationReportMixin,
-    IngestionStageReport,
 ):
     account_locator: Optional[str] = None
     region: Optional[str] = None
@@ -77,7 +77,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-    PROFILING,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     GlobalTags,
@@ -360,7 +360,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)
 
         if self.profiler and db_tables:
-            with self.report.new_stage(f"{snowflake_db.name}: {PROFILING}"):
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                yield from self.profiler.get_workunits(snowflake_db, db_tables)
 
     def _process_db_schemas(
@@ -37,7 +37,7 @@ from datahub.ingestion.source.sql.sql_config import (
 
 logger = logging.getLogger(__name__)
 
-oracledb.version = "8.3.0"
+oracledb.version = "8.3.0"  # type: ignore[assignment]
 sys.modules["cx_Oracle"] = oracledb
 
 extra_oracle_types = {