acryl-datahub 1.2.0.11rc4__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/METADATA +2582 -2577
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/RECORD +43 -40
- datahub/_version.py +1 -1
- datahub/cli/docker_check.py +1 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +296 -0
- datahub/ingestion/api/source.py +29 -5
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_profiling.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -2
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/fivetran/config.py +30 -5
- datahub/ingestion/source/fivetran/fivetran.py +0 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +13 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +43 -28
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/grafana/models.py +9 -1
- datahub/ingestion/source/grafana/report.py +1 -2
- datahub/ingestion/source/hex/hex.py +0 -2
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -2
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +2 -2
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +25 -17
- datahub/ingestion/source/sql/teradata.py +1 -2
- datahub/ingestion/source/sql_queries.py +1 -2
- datahub/ingestion/source/tableau/tableau.py +0 -2
- datahub/ingestion/source/unity/config.py +11 -42
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/report.py +1 -2
- datahub/ingestion/source_report/ingestion_stage.py +54 -12
- datahub/metadata/_internal_schema_classes.py +169 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/schema.avsc +101 -0
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/source.py
CHANGED
@@ -25,7 +25,6 @@ from typing_extensions import LiteralString, Self

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -46,8 +45,15 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit,
     auto_workunit_reporter,
 )
+from datahub.ingestion.api.source_protocols import (
+    MetadataWorkUnitIterable,
+    ProfilingCapable,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.
+from datahub.ingestion.source_report.ingestion_stage import (
+    IngestionHighStage,
+    IngestionStageReport,
+)
 from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
@@ -205,7 +211,7 @@ class StructuredLogs(Report):


 @dataclass
-class SourceReport(ExamplesReport):
+class SourceReport(ExamplesReport, IngestionStageReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
@@ -553,13 +559,31 @@ class Source(Closeable, metaclass=ABCMeta):
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         workunit_processors = self.get_workunit_processors()
         workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
-
+        # Process main workunits
+        yield from self._apply_workunit_processors(
             workunit_processors, auto_workunit(self.get_workunits_internal())
         )
+        # Process profiling workunits
+        yield from self._process_profiling_stage(workunit_processors)
+
+    def _process_profiling_stage(
+        self, processors: List[Optional[MetadataWorkUnitProcessor]]
+    ) -> Iterable[MetadataWorkUnit]:
+        """Process profiling stage if source supports it."""
+        if (
+            not isinstance(self, ProfilingCapable)
+            or not self.is_profiling_enabled_internal()
+        ):
+            return
+        with self.get_report().new_high_stage(IngestionHighStage.PROFILING):
+            profiling_stream = self._apply_workunit_processors(
+                processors, auto_workunit(self.get_profiling_internal())
+            )
+            yield from profiling_stream

     def get_workunits_internal(
         self,
-    ) ->
+    ) -> MetadataWorkUnitIterable:
         raise NotImplementedError(
             "get_workunits_internal must be implemented if get_workunits is not overriden."
         )
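Every profiling block in this release is wrapped in `report.new_high_stage(IngestionHighStage.PROFILING)`. The context manager itself lives in `datahub/ingestion/source_report/ingestion_stage.py` (changed in this release but not excerpted here), so the snippet below is only a stand-in sketch of the shape such a stage tracker can take: a timing context manager on a report dataclass. The `StageTimingReport` class, its `high_stage_seconds` field, and the timing logic are illustrative assumptions, not the actual DataHub implementation.

    import time
    from contextlib import contextmanager
    from dataclasses import dataclass, field
    from enum import Enum
    from typing import Dict, Iterator


    class IngestionHighStage(Enum):
        # Stand-in enum; the real one is defined in ingestion_stage.py.
        METADATA_EXTRACTION = "METADATA_EXTRACTION"
        PROFILING = "PROFILING"


    @dataclass
    class StageTimingReport:
        # Hypothetical field: wall-clock seconds spent in each high-level stage.
        high_stage_seconds: Dict[str, float] = field(default_factory=dict)

        @contextmanager
        def new_high_stage(self, stage: IngestionHighStage) -> Iterator[None]:
            # Time the enclosed block and record it under the stage name.
            start = time.perf_counter()
            try:
                yield
            finally:
                self.high_stage_seconds[stage.value] = time.perf_counter() - start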

datahub/ingestion/api/source_protocols.py
ADDED
@@ -0,0 +1,23 @@
+from typing import Iterable, Protocol, Union, runtime_checkable
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.sdk.entity import Entity
+
+# Type alias for metadata work units - Python 3.9 compatible
+MetadataWorkUnitIterable = Iterable[
+    Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]
+]
+
+
+@runtime_checkable
+class ProfilingCapable(Protocol):
+    """Protocol for sources that support profiling functionality."""
+
+    def is_profiling_enabled_internal(self) -> bool:
+        """Check if profiling is enabled for this source."""
+        ...
+
+    def get_profiling_internal(self) -> MetadataWorkUnitIterable:
+        """Generate profiling work units."""
+        ...
|
|
|
9
9
|
from datahub.ingestion.api.report import Report
|
|
10
10
|
from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
|
|
11
11
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
12
|
-
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
13
12
|
from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
|
|
14
13
|
from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
|
|
15
14
|
from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
|
|
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
|
|
|
78
77
|
@dataclass
|
|
79
78
|
class BigQueryV2Report(
|
|
80
79
|
SQLSourceReport,
|
|
81
|
-
IngestionStageReport,
|
|
82
80
|
BaseTimeWindowReport,
|
|
83
81
|
ClassificationReportMixin,
|
|
84
82
|
):
|
|
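The `IngestionStageReport` removals in this and the following report classes are possible because `SourceReport` itself now mixes in `IngestionStageReport` (see the source.py hunk above), so every per-source report inherits stage tracking transitively. A quick check, assuming the 1.3.0 wheel is installed:

    from datahub.ingestion.api.source import SourceReport
    from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

    # In 1.3.0 the stage mixin arrives through SourceReport's bases, so report
    # classes such as BigQueryV2Report no longer need to list it explicitly.
    assert issubclass(SourceReport, IngestionStageReport)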

datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
CHANGED
@@ -66,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     METADATA_EXTRACTION,
-
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     Status,
@@ -416,7 +416,7 @@ class BigQuerySchemaGenerator:

         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            with self.report.
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                 yield from self.profiler.get_workunits(
                     project_id=project_id,
                     tables=db_tables,

datahub/ingestion/source/cassandra/cassandra_profiling.py
CHANGED
@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
 )
 from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
 from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -71,7 +71,7 @@ class CassandraProfiler:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
             with (
-                self.report.
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
                 ThreadPoolExecutor(
                     max_workers=self.config.profiling.max_workers
                 ) as executor,
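The Cassandra profiler (and, below, the Dremio profiler) combines the stage context manager and a thread pool in a single parenthesized `with` statement, officially supported since Python 3.10. A standalone illustration of that pattern using only the standard library (`timed_stage` is an illustrative stand-in, not DataHub code):

    from concurrent.futures import ThreadPoolExecutor
    from contextlib import contextmanager
    from typing import Iterator


    @contextmanager
    def timed_stage(name: str) -> Iterator[None]:
        print(f"entering stage: {name}")
        try:
            yield
        finally:
            print(f"leaving stage: {name}")


    with (
        timed_stage("PROFILING"),
        ThreadPoolExecutor(max_workers=4) as executor,
    ):
        # Work submitted here runs while the PROFILING stage is open.
        futures = [executor.submit(pow, 2, n) for n in range(5)]
        print([f.result() for f in futures])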

datahub/ingestion/source/cassandra/cassandra_utils.py
CHANGED
@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(


 @dataclass
-class CassandraSourceReport(StaleEntityRemovalSourceReport
+class CassandraSourceReport(StaleEntityRemovalSourceReport):
     num_tables_failed: int = 0
     num_views_failed: int = 0
     tables_scanned: int = 0

datahub/ingestion/source/dremio/dremio_reporting.py
CHANGED
@@ -6,7 +6,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.stats_collections import (
@@ -20,7 +19,6 @@ from datahub.utilities.stats_collections import (
 class DremioSourceReport(
     SQLSourceReport,
     StaleEntityRemovalSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0

datahub/ingestion/source/dremio/dremio_source.py
CHANGED
@@ -55,7 +55,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
@@ -283,7 +283,7 @@ class DremioSource(StatefulIngestionSourceBase):
         # Profiling
         if self.config.is_profiling_enabled():
             with (
-                self.report.
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
                 ThreadPoolExecutor(
                     max_workers=self.config.profiling.max_workers
                 ) as executor,

datahub/ingestion/source/fivetran/config.py
CHANGED
@@ -29,6 +29,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

@@ -56,8 +57,8 @@ class Constant:
     STATUS = "status"
     USER_ID = "user_id"
     EMAIL = "email"
-    CONNECTOR_ID = "
-    CONNECTOR_NAME = "
+    CONNECTOR_ID = "connection_id"
+    CONNECTOR_NAME = "connection_name"
     CONNECTOR_TYPE_ID = "connector_type_id"
     PAUSED = "paused"
     SYNC_FREQUENCY = "sync_frequency"
@@ -85,10 +86,23 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):
     dataset: str = Field(description="The fivetran connector log dataset.")


+class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
+    catalog: str = Field(description="The fivetran connector log catalog.")
+    log_schema: str = Field(description="The fivetran connector log schema.")
+
+    @pydantic.validator("warehouse_id")
+    def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
+        if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
+            raise ValueError("Fivetran requires warehouse_id to be set")
+        return warehouse_id
+
+
 class FivetranLogConfig(ConfigModel):
-    destination_platform: Literal["snowflake", "bigquery"] =
-
-
+    destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
+        pydantic.Field(
+            default="snowflake",
+            description="The destination platform where fivetran connector log tables are dumped.",
+        )
     )
     snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
         default=None,
@@ -98,6 +112,12 @@ class FivetranLogConfig(ConfigModel):
         default=None,
         description="If destination platform is 'bigquery', provide bigquery configuration.",
     )
+    databricks_destination_config: Optional[DatabricksDestinationConfig] = (
+        pydantic.Field(
+            default=None,
+            description="If destination platform is 'databricks', provide databricks configuration.",
+        )
+    )
     _rename_destination_config = pydantic_renamed_field(
         "destination_config", "snowflake_destination_config"
     )
@@ -115,6 +135,11 @@ class FivetranLogConfig(ConfigModel):
                 raise ValueError(
                     "If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
                 )
+        elif destination_platform == "databricks":
+            if "databricks_destination_config" not in values:
+                raise ValueError(
+                    "If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
+                )
         else:
             raise ValueError(
                 f"Destination platform '{destination_platform}' is not yet supported."
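With `DatabricksDestinationConfig` in place, a Fivetran recipe can point the log connector at a Databricks destination. A hedged sketch of what the parsed config might look like: the `workspace_url` and `token` field names are assumptions about `UnityCatalogConnectionConfig` (its new module is not excerpted here), while `warehouse_id`, `catalog`, and `log_schema` come from the fields and validator shown above; all values are placeholders.

    from datahub.ingestion.source.fivetran.config import FivetranLogConfig

    log_config = FivetranLogConfig.parse_obj(
        {
            "destination_platform": "databricks",
            "databricks_destination_config": {
                # workspace_url/token are assumed Unity Catalog connection fields.
                "workspace_url": "https://my-workspace.cloud.databricks.com",
                "token": "dapi-placeholder",
                "warehouse_id": "placeholder-warehouse-id",
                "catalog": "fivetran_metadata",
                "log_schema": "fivetran_log",
            },
        }
    )
    print(log_config.destination_platform)  # databricks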

datahub/ingestion/source/fivetran/fivetran.py
CHANGED
@@ -66,7 +66,6 @@ logger = logging.getLogger(__name__)
 class FivetranSource(StatefulIngestionSourceBase):
     """
     This plugin extracts fivetran users, connectors, destinations and sync history.
-    This plugin is in beta and has only been tested on Snowflake connector.
     """

     config: FivetranSourceConfig

datahub/ingestion/source/fivetran/fivetran_log_api.py
CHANGED
@@ -73,6 +73,19 @@ class FivetranLogAPI:
                 if result is None:
                     raise ValueError("Failed to retrieve BigQuery project ID")
                 fivetran_log_database = result[0]
+        elif destination_platform == "databricks":
+            databricks_destination_config = (
+                self.fivetran_log_config.databricks_destination_config
+            )
+            if databricks_destination_config is not None:
+                engine = create_engine(
+                    databricks_destination_config.get_sql_alchemy_url(
+                        databricks_destination_config.catalog
+                    ),
+                    **databricks_destination_config.get_options(),
+                )
+                fivetran_log_query.set_schema(databricks_destination_config.log_schema)
+                fivetran_log_database = databricks_destination_config.catalog
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."

datahub/ingestion/source/fivetran/fivetran_query.py
CHANGED
@@ -6,6 +6,21 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000
 MAX_JOBS_PER_CONNECTOR = 500


+"""
+------------------------------------------------------------------------------------------------------------
+Fivetran Platform Connector Handling
+------------------------------------------------------------------------------------------------------------
+Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)
+
+All queries have to be updated as per Fivetran Platform Connector release if any. We expect customers
+and fivetran to keep platform connector configured for DataHub with auto sync enabled to get latest changes.
+
+References:
+- Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
+- Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
+"""
+
+
 class FivetranLogQuery:
     # Note: All queries are written in Snowflake SQL.
     # They will be transpiled to the target database's SQL dialect at runtime.
@@ -30,17 +45,17 @@ class FivetranLogQuery:
     def get_connectors_query(self) -> str:
         return f"""\
 SELECT
-
+connection_id,
 connecting_user_id,
 connector_type_id,
-
+connection_name,
 paused,
 sync_frequency,
 destination_id
-FROM {self.schema_clause}
+FROM {self.schema_clause}connection
 WHERE
 _fivetran_deleted = FALSE
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
 """

     def get_users_query(self) -> str:
@@ -63,20 +78,20 @@ FROM {self.schema_clause}user
         return f"""\
 WITH ranked_syncs AS (
 SELECT
-
+connection_id,
 sync_id,
 MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
 MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
 MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
-ROW_NUMBER() OVER (PARTITION BY
+ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
 FROM {self.schema_clause}log
 WHERE message_event in ('sync_start', 'sync_end')
 AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
-AND
-GROUP BY
+AND connection_id IN ({formatted_connector_ids})
+GROUP BY connection_id, sync_id
 )
 SELECT
-
+connection_id,
 sync_id,
 start_time,
 end_time,
@@ -85,7 +100,7 @@ FROM ranked_syncs
 WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
 AND start_time IS NOT NULL
 AND end_time IS NOT NULL
-ORDER BY
+ORDER BY connection_id, end_time DESC
 """

     def get_table_lineage_query(self, connector_ids: List[str]) -> str:
@@ -97,7 +112,7 @@ SELECT
 *
 FROM (
 SELECT
-stm.
+stm.connection_id as connection_id,
 stm.id as source_table_id,
 stm.name as source_table_name,
 ssm.name as source_schema_name,
@@ -105,18 +120,18 @@ FROM (
 dtm.name as destination_table_name,
 dsm.name as destination_schema_name,
 tl.created_at as created_at,
-ROW_NUMBER() OVER (PARTITION BY stm.
+ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
 FROM {self.schema_clause}table_lineage as tl
-JOIN {self.schema_clause}
-JOIN {self.schema_clause}
-JOIN {self.schema_clause}
-JOIN {self.schema_clause}
-WHERE stm.
+JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
+JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
+JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
+JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
+WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per source and destination pair.
 WHERE table_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
-ORDER BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """

     def get_column_lineage_query(self, connector_ids: List[str]) -> str:
@@ -131,25 +146,25 @@ SELECT
 destination_column_name
 FROM (
 SELECT
-stm.
+stm.connection_id as connection_id,
 scm.table_id as source_table_id,
 dcm.table_id as destination_table_id,
 scm.name as source_column_name,
 dcm.name as destination_column_name,
 cl.created_at as created_at,
-ROW_NUMBER() OVER (PARTITION BY stm.
+ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
 FROM {self.schema_clause}column_lineage as cl
-JOIN {self.schema_clause}
+JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
 ON cl.source_column_id = scm.id
-JOIN {self.schema_clause}
+JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
 ON cl.destination_column_id = dcm.id
--- Only joining
-JOIN {self.schema_clause}
+-- Only joining source_table to get the connection_id.
+JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
 ON scm.table_id = stm.id
-WHERE stm.
+WHERE stm.connection_id IN ({formatted_connector_ids})
 )
 -- Ensure that we only get back one entry per (connector, source column, destination column) pair.
 WHERE column_combo_rn = 1
-QUALIFY ROW_NUMBER() OVER (PARTITION BY
-ORDER BY
+QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
+ORDER BY connection_id, created_at DESC
 """
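Since all of these queries are built from `self.schema_clause`, the quickest way to see the renamed `connection` / `connection_id` columns in the generated Snowflake SQL is to drive the query builder directly; `set_schema()` and `get_connectors_query()` both appear in this diff, and the schema name below is a placeholder:

    from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

    q = FivetranLogQuery()
    q.set_schema("fivetran_log")  # placeholder schema, as FivetranLogAPI does for the databricks destination above

    # Prints the connectors query against the renamed `connection` table,
    # deduplicated per connection_id via QUALIFY ROW_NUMBER().
    print(q.get_connectors_query())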

datahub/ingestion/source/gc/datahub_gc.py
CHANGED
@@ -34,7 +34,6 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

 logger = logging.getLogger(__name__)

@@ -87,7 +86,6 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
-    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0

datahub/ingestion/source/grafana/models.py
CHANGED
@@ -8,12 +8,14 @@ References:
 - Dashboard JSON structure: https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/view-dashboard-json-model/
 """

+import logging
 from typing import Any, Dict, List, Optional

 from pydantic import BaseModel, ConfigDict, Field

 from datahub.emitter.mcp_builder import ContainerKey

+logger = logging.getLogger(__name__)
 # Grafana-specific type definitions for better type safety
 GrafanaQueryTarget = Dict[
     str, Any
@@ -89,7 +91,13 @@ class Dashboard(_GrafanaBaseModel):
     def parse_obj(cls, data: Dict[str, Any]) -> "Dashboard":
         """Custom parsing to handle nested panel extraction."""
         dashboard_data = data.get("dashboard", {})
-
+        _panel_data = dashboard_data.get("panels", [])
+        try:
+            panels = cls.extract_panels(_panel_data)
+        except Exception as e:
+            logger.warning(
+                f"Error extracting panels from dashboard for dashboard panels {_panel_data} : {e}"
+            )

         # Extract meta.folderId from nested structure
         meta = dashboard_data.get("meta", {})

datahub/ingestion/source/grafana/report.py
CHANGED
@@ -3,11 +3,10 @@ from dataclasses import dataclass
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport


 @dataclass
-class GrafanaSourceReport(StaleEntityRemovalSourceReport
+class GrafanaSourceReport(StaleEntityRemovalSourceReport):
     # Entity counters
     dashboards_scanned: int = 0
     charts_scanned: int = 0

datahub/ingestion/source/hex/hex.py
CHANGED
@@ -46,7 +46,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.sdk.main_client import DataHubClient


@@ -172,7 +171,6 @@ class HexSourceConfig(
 class HexReport(
     StaleEntityRemovalSourceReport,
     HexApiReport,
-    IngestionStageReport,
     HexQueryFetcherReport,
 ):
     pass

datahub/ingestion/source/redshift/redshift.py
CHANGED
@@ -89,8 +89,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import (
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-    PROFILING,
     USAGE_EXTRACTION_INGESTION,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes, TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
@@ -446,7 +446,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         )

         if self.config.is_profiling_enabled():
-            with self.report.
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                 profiler = RedshiftProfiler(
                     config=self.config,
                     report=self.report,

datahub/ingestion/source/redshift/report.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Dict, Optional

 from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict
@@ -15,7 +14,6 @@ from datahub.utilities.stats_collections import TopKDict
 @dataclass
 class RedshiftReport(
     SQLSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
     ClassificationReportMixin,
 ):

datahub/ingestion/source/snowflake/snowflake_report.py
CHANGED
@@ -9,7 +9,6 @@ from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict
@@ -96,7 +95,6 @@ class SnowflakeV2Report(
     SnowflakeUsageReport,
     StatefulIngestionReport,
     ClassificationReportMixin,
-    IngestionStageReport,
 ):
     account_locator: Optional[str] = None
     region: Optional[str] = None

datahub/ingestion/source/snowflake/snowflake_schema_gen.py
CHANGED
@@ -77,7 +77,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
     LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
-
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     GlobalTags,
@@ -360,7 +360,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         yield from self._process_db_schemas(snowflake_db, db_tables)

         if self.profiler and db_tables:
-            with self.report.
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                 yield from self.profiler.get_workunits(snowflake_db, db_tables)

     def _process_db_schemas(