acryl-datahub 1.2.0.11rc4__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/METADATA +2582 -2577
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/RECORD +43 -40
- datahub/_version.py +1 -1
- datahub/cli/docker_check.py +1 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +296 -0
- datahub/ingestion/api/source.py +29 -5
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_profiling.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -2
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/fivetran/config.py +30 -5
- datahub/ingestion/source/fivetran/fivetran.py +0 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +13 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +43 -28
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/grafana/models.py +9 -1
- datahub/ingestion/source/grafana/report.py +1 -2
- datahub/ingestion/source/hex/hex.py +0 -2
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -2
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +2 -2
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/sql_common.py +25 -17
- datahub/ingestion/source/sql/teradata.py +1 -2
- datahub/ingestion/source/sql_queries.py +1 -2
- datahub/ingestion/source/tableau/tableau.py +0 -2
- datahub/ingestion/source/unity/config.py +11 -42
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/report.py +1 -2
- datahub/ingestion/source_report/ingestion_stage.py +54 -12
- datahub/metadata/_internal_schema_classes.py +169 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/schema.avsc +101 -0
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.11rc4.dist-info → acryl_datahub-1.3.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py

```diff
@@ -46,6 +46,7 @@ from datahub.ingestion.api.source import (
     TestableSource,
     TestConnectionReport,
 )
+from datahub.ingestion.api.source_protocols import MetadataWorkUnitIterable
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     SAMPLE_SIZE_MULTIPLIER,
@@ -578,19 +579,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         self._add_default_options(sql_config)
 
         for inspector in self.get_inspectors():
-            profiler = None
-            profile_requests: List["GEProfilerRequest"] = []
-            if sql_config.is_profiling_enabled():
-                profiler = self.get_profiler_instance(inspector)
-                try:
-                    self.add_profile_metadata(inspector)
-                except Exception as e:
-                    self.warn(
-                        logger,
-                        "profile_metadata",
-                        f"Failed to get enrichment data for profile {e}",
-                    )
-
             db_name = self.get_db_name(inspector)
             yield from self.get_database_level_workunits(
                 inspector=inspector,
@@ -606,19 +594,39 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     database=db_name,
                 )
 
+        # Generate workunit for aggregated SQL parsing results
+        yield from self._generate_aggregator_workunits()
+
+    def is_profiling_enabled_internal(self) -> bool:
+        return self.config.is_profiling_enabled()
+
+    def get_profiling_internal(
+        self,
+    ) -> MetadataWorkUnitIterable:
+        sql_config = self.config
+        for inspector in self.get_inspectors():
+            profiler = None
+            profile_requests: List["GEProfilerRequest"] = []
+            profiler = self.get_profiler_instance(inspector)
+            try:
+                self.add_profile_metadata(inspector)
+            except Exception as e:
+                self.warn(
+                    logger,
+                    "profile_metadata",
+                    f"Failed to get enrichment data for profile {e}",
+                )
+            db_name = self.get_db_name(inspector)
+            for schema in self.get_allowed_schemas(inspector, db_name):
                 if profiler:
                     profile_requests += list(
                         self.loop_profiler_requests(inspector, schema, sql_config)
                     )
-
             if profiler and profile_requests:
                 yield from self.loop_profiler(
                     profile_requests, profiler, platform=self.platform
                 )
 
-        # Generate workunit for aggregated SQL parsing results
-        yield from self._generate_aggregator_workunits()
-
     def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
         """Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
         for mcp in self.aggregator.gen_metadata():
```
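The refactor above pulls profiling out of `get_workunits_internal` and exposes it through two hooks, `is_profiling_enabled_internal` and `get_profiling_internal`, so the framework can run profiling as its own pass (the `source.py` and `source_protocols.py` entries in the file list carry the framework side). Below is a toy sketch of the pattern, not DataHub's actual framework code; `SupportsProfiling`, `ToySource`, and `run` are hypothetical names:

```python
from typing import Iterable, Protocol, runtime_checkable


@runtime_checkable
class SupportsProfiling(Protocol):  # hypothetical stand-in for the real protocol
    def is_profiling_enabled_internal(self) -> bool: ...
    def get_profiling_internal(self) -> Iterable[str]: ...


class ToySource:
    def get_workunits_internal(self) -> Iterable[str]:
        # Regular metadata extraction; profiling no longer happens here.
        yield "schema-workunit"

    def is_profiling_enabled_internal(self) -> bool:
        return True

    def get_profiling_internal(self) -> Iterable[str]:
        # Profiling is emitted from a dedicated hook instead.
        yield "profile-workunit"


def run(source: ToySource) -> None:
    # Framework side: main extraction first, then profiling as a separate phase.
    for wu in source.get_workunits_internal():
        print(wu)
    if isinstance(source, SupportsProfiling) and source.is_profiling_enabled_internal():
        for wu in source.get_profiling_internal():
            print(wu)


run(ToySource())
```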
datahub/ingestion/source/sql/teradata.py

```diff
@@ -51,7 +51,6 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemySource,
 )
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     BytesTypeClass,
@@ -434,7 +433,7 @@ def optimized_get_view_definition(
 
 
 @dataclass
-class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
+class TeradataReport(SQLSourceReport, BaseTimeWindowReport):
     # View processing metrics (actively used)
     num_views_processed: int = 0
     num_view_processing_failures: int = 0
```
datahub/ingestion/source/sql_queries.py

```diff
@@ -40,7 +40,6 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.urns import CorpUserUrn, DatasetUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sql_parsing_aggregator import (
@@ -86,7 +85,7 @@ class SqlQueriesSourceConfig(
 
 
 @dataclass
-class SqlQueriesSourceReport(SourceReport, IngestionStageReport):
+class SqlQueriesSourceReport(SourceReport):
     num_entries_processed: int = 0
     num_entries_failed: int = 0
     num_queries_aggregator_failures: int = 0
```
datahub/ingestion/source/tableau/tableau.py

```diff
@@ -120,7 +120,6 @@ from datahub.ingestion.source.tableau.tableau_common import (
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -795,7 +794,6 @@ class SiteIdContentUrl:
 @dataclass
 class TableauSourceReport(
     StaleEntityRemovalSourceReport,
-    IngestionStageReport,
 ):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
```
datahub/ingestion/source/unity/config.py

```diff
@@ -2,7 +2,6 @@ import logging
 import os
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, List, Optional, Union
-from urllib.parse import urlparse
 
 import pydantic
 from pydantic import Field
@@ -20,10 +19,8 @@ from datahub.configuration.source_common import (
 )
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
-from datahub.ingestion.source.ge_data_profiler import DATABRICKS
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
 )
@@ -31,6 +28,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
     StatefulProfilingConfigMixin,
 )
+from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
@@ -133,6 +131,7 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig
 
 
 class UnityCatalogSourceConfig(
+    UnityCatalogConnectionConfig,
     SQLCommonConfig,
     StatefulIngestionConfigBase,
     BaseUsageConfig,
@@ -140,31 +139,6 @@ class UnityCatalogSourceConfig(
     StatefulProfilingConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
 ):
-    token: str = pydantic.Field(description="Databricks personal access token")
-    workspace_url: str = pydantic.Field(
-        description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
-    )
-    warehouse_id: Optional[str] = pydantic.Field(
-        default=None,
-        description=(
-            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
-            "Required for the following features that need SQL access: "
-            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
-            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
-            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
-            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
-            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
-        ),
-    )
-    include_hive_metastore: bool = pydantic.Field(
-        default=INCLUDE_HIVE_METASTORE_DEFAULT,
-        description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
-    )
-    workspace_name: Optional[str] = pydantic.Field(
-        default=None,
-        description="Name of the workspace. Default to deployment name present in workspace_url",
-    )
-
     include_metastore: bool = pydantic.Field(
         default=False,
         description=(
@@ -344,7 +318,15 @@ class UnityCatalogSourceConfig(
     _forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
     _forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)
 
-
+    include_hive_metastore: bool = pydantic.Field(
+        default=INCLUDE_HIVE_METASTORE_DEFAULT,
+        description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
+    )
+
+    workspace_name: Optional[str] = pydantic.Field(
+        default=None,
+        description="Name of the workspace. Default to deployment name present in workspace_url",
+    )
 
     def __init__(self, **data):
         # First, let the parent handle the root validators and field processing
@@ -386,19 +368,6 @@ class UnityCatalogSourceConfig(
             forced_disable_hive_metastore_extraction
         )
 
-    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
-        uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
-        if database:
-            uri_opts["catalog"] = database
-        return make_sqlalchemy_uri(
-            scheme=self.scheme,
-            username="token",
-            password=self.token,
-            at=urlparse(self.workspace_url).netloc,
-            db=database,
-            uri_opts=uri_opts,
-        )
-
     def is_profiling_enabled(self) -> bool:
         return self.profiling.enabled and is_profiling_enabled(
             self.profiling.operation_config
```
datahub/ingestion/source/unity/connection.py (new file)

```diff
@@ -0,0 +1,61 @@
+"""Databricks Unity Catalog connection configuration."""
+
+from typing import Any, Dict, Optional
+from urllib.parse import urlparse
+
+import pydantic
+from pydantic import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
+
+DATABRICKS = "databricks"
+
+
+class UnityCatalogConnectionConfig(ConfigModel):
+    """
+    Configuration for connecting to Databricks Unity Catalog.
+    Contains only connection-related fields that can be reused across different sources.
+    """
+
+    scheme: str = DATABRICKS
+    token: str = pydantic.Field(description="Databricks personal access token")
+    workspace_url: str = pydantic.Field(
+        description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
+    )
+    warehouse_id: Optional[str] = pydantic.Field(
+        default=None,
+        description=(
+            "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
+            "Required for the following features that need SQL access: "
+            "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
+            "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
+            "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
+            "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
+            "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
+        ),
+    )
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to Databricks SQLAlchemy client.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
+        uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
+        if database:
+            uri_opts["catalog"] = database
+        return make_sqlalchemy_uri(
+            scheme=self.scheme,
+            username="token",
+            password=self.token,
+            at=urlparse(self.workspace_url).netloc,
+            db=database,
+            uri_opts=uri_opts,
+        )
+
+    def get_options(self) -> dict:
+        return self.extra_client_options
```
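Since `UnityCatalogSourceConfig` now inherits these fields, existing recipes keep the same keys. A minimal usage sketch of the extracted class in isolation; the token, workspace URL, and warehouse id are placeholders, and the printed URI shape is an assumption based on the `make_sqlalchemy_uri` arguments above:

```python
from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig

conn = UnityCatalogConnectionConfig(
    token="dapiXXXX",  # placeholder personal access token
    workspace_url="https://my-workspace.cloud.databricks.com",
    warehouse_id="abc123",
)
print(conn.get_sql_alchemy_url(database="main"))
# Expected shape (an assumption, not verified output):
# databricks://token:dapiXXXX@my-workspace.cloud.databricks.com/main?http_path=/sql/1.0/warehouses/abc123&catalog=main
```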
datahub/ingestion/source/unity/report.py

```diff
@@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -24,7 +23,7 @@ class UnityCatalogUsagePerfReport(Report):
 
 
 @dataclass
-class UnityCatalogReport(SQLSourceReport, IngestionStageReport):
+class UnityCatalogReport(SQLSourceReport):
     metastores: EntityFilterReport = EntityFilterReport.field(type="metastore")
     catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog")
     schemas: EntityFilterReport = EntityFilterReport.field(type="schema")
```
datahub/ingestion/source_report/ingestion_stage.py

```diff
@@ -1,7 +1,10 @@
 import logging
+from collections import defaultdict
 from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
+from enum import Enum
+from typing import Tuple
 
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -20,31 +23,70 @@ QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"
 
 
+class IngestionHighStage(Enum):
+    """
+    The high-level stages at the framework level
+    Team to add more stages as needed
+    """
+
+    PROFILING = "Profiling"
+    _UNDEFINED = "Ingestion"
+
+
 @dataclass
 class IngestionStageReport:
-    ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
+    ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
+        default_factory=lambda: defaultdict(float)
+    )
+    ingestion_stage_durations: TopKDict[Tuple[IngestionHighStage, str], float] = field(
+        default_factory=TopKDict
+    )
 
-    def new_stage(self, stage: str) -> "IngestionStageContext":
-        return IngestionStageContext(stage, self)
+    def new_stage(
+        self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
+    ) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self, high_stage)
+
+    def new_high_stage(self, stage: IngestionHighStage) -> "IngestionStageContext":
+        return IngestionStageContext("", self, stage)
 
 
 @dataclass
 class IngestionStageContext(AbstractContextManager):
-    def __init__(self, stage: str, report: IngestionStageReport):
-        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+    def __init__(
+        self,
+        stage: str,
+        report: IngestionStageReport,
+        high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED,
+    ):
+        self._high_stage = high_stage
+        self._ingestion_stage = (
+            f"{stage} at {datetime.now(timezone.utc)}" if stage else ""
+        )
         self._timer: PerfTimer = PerfTimer()
         self._report = report
 
     def __enter__(self) -> "IngestionStageContext":
-        logger.info(f"Stage started: {self._ingestion_stage}")
+        if self._ingestion_stage:
+            logger.info(f"Stage started: {self._ingestion_stage}")
+        else:
+            logger.info(f"High stage started: {self._high_stage.value}")
         self._timer.start()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         elapsed = self._timer.elapsed_seconds(digits=2)
-        logger.info(
-            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
-            stacklevel=2,
-        )
-        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
-        return None
+        if self._ingestion_stage:
+            logger.info(
+                f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            self._report.ingestion_stage_durations[
+                (self._high_stage, self._ingestion_stage)
+            ] = elapsed
+        else:
+            logger.info(
+                f"Time spent in stage <{self._high_stage.value}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            self._report.ingestion_high_stage_seconds[self._high_stage] += elapsed
```
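For orientation, a small usage sketch of the reworked timing API above (assuming acryl-datahub 1.3.0 is installed; `time.sleep` stands in for real work):

```python
import time

from datahub.ingestion.source_report.ingestion_stage import (
    IngestionHighStage,
    IngestionStageReport,
)

report = IngestionStageReport()

# Fine-grained stage; without an explicit high_stage argument it is
# bucketed under the default "Ingestion" high stage.
with report.new_stage("Schemas"):
    time.sleep(0.05)

# Framework-level high stage with no fine-grained label; its elapsed time
# accumulates in ingestion_high_stage_seconds.
with report.new_high_stage(IngestionHighStage.PROFILING):
    time.sleep(0.05)

print(dict(report.ingestion_high_stage_seconds))
print(dict(report.ingestion_stage_durations))  # keyed by (high_stage, stage label)
```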
datahub/metadata/_internal_schema_classes.py

```diff
@@ -21758,6 +21758,153 @@ class ParametersClass(DictWrapper):
     pass
 
 
+class RelationshipChangeEventClass(DictWrapper):
+    """Kafka event for proposing a relationship change between two entities.
+    For example, when dataset1 establishes a new downstream relationship with dataset2."""
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.platform.event.v1.RelationshipChangeEvent")
+    def __init__(self,
+        sourceUrn: str,
+        destinationUrn: str,
+        operation: Union[str, "RelationshipChangeOperationClass"],
+        relationshipType: str,
+        auditStamp: "AuditStampClass",
+        auditHeader: Union[None, "KafkaAuditHeaderClass"]=None,
+        lifecycleOwner: Union[None, str]=None,
+        via: Union[None, str]=None,
+        properties: Union[None, Dict[str, str]]=None,
+    ):
+        super().__init__()
+
+        self.auditHeader = auditHeader
+        self.sourceUrn = sourceUrn
+        self.destinationUrn = destinationUrn
+        self.operation = operation
+        self.relationshipType = relationshipType
+        self.lifecycleOwner = lifecycleOwner
+        self.via = via
+        self.properties = properties
+        self.auditStamp = auditStamp
+
+    def _restore_defaults(self) -> None:
+        self.auditHeader = self.RECORD_SCHEMA.fields_dict["auditHeader"].default
+        self.sourceUrn = str()
+        self.destinationUrn = str()
+        self.operation = RelationshipChangeOperationClass.ADD
+        self.relationshipType = str()
+        self.lifecycleOwner = self.RECORD_SCHEMA.fields_dict["lifecycleOwner"].default
+        self.via = self.RECORD_SCHEMA.fields_dict["via"].default
+        self.properties = self.RECORD_SCHEMA.fields_dict["properties"].default
+        self.auditStamp = AuditStampClass._construct_with_defaults()
+
+
+    @property
+    def auditHeader(self) -> Union[None, "KafkaAuditHeaderClass"]:
+        """Kafka audit header containing metadata about the message itself.
+        Includes information like message ID, timestamp, and server details."""
+        return self._inner_dict.get('auditHeader')  # type: ignore
+
+    @auditHeader.setter
+    def auditHeader(self, value: Union[None, "KafkaAuditHeaderClass"]) -> None:
+        self._inner_dict['auditHeader'] = value
+
+
+    @property
+    def sourceUrn(self) -> str:
+        """The URN (Uniform Resource Name) of the source entity in the relationship.
+        In a downstream relationship example, this would be the URN of the upstream dataset."""
+        return self._inner_dict.get('sourceUrn')  # type: ignore
+
+    @sourceUrn.setter
+    def sourceUrn(self, value: str) -> None:
+        self._inner_dict['sourceUrn'] = value
+
+
+    @property
+    def destinationUrn(self) -> str:
+        """The URN of the destination entity in the relationship.
+        In a downstream relationship example, this would be the URN of the downstream dataset."""
+        return self._inner_dict.get('destinationUrn')  # type: ignore
+
+    @destinationUrn.setter
+    def destinationUrn(self, value: str) -> None:
+        self._inner_dict['destinationUrn'] = value
+
+
+    @property
+    def operation(self) -> Union[str, "RelationshipChangeOperationClass"]:
+        """The operation being performed on this relationship.
+        Typically includes operations like ADD, REMOVE, or RESTATE."""
+        return self._inner_dict.get('operation')  # type: ignore
+
+    @operation.setter
+    def operation(self, value: Union[str, "RelationshipChangeOperationClass"]) -> None:
+        self._inner_dict['operation'] = value
+
+
+    @property
+    def relationshipType(self) -> str:
+        """The type/category of relationship being established or modified.
+        Examples: "DownstreamOf", "Contains", "OwnedBy", "DerivedFrom", etc."""
+        return self._inner_dict.get('relationshipType')  # type: ignore
+
+    @relationshipType.setter
+    def relationshipType(self, value: str) -> None:
+        self._inner_dict['relationshipType'] = value
+
+
+    @property
+    def lifecycleOwner(self) -> Union[None, str]:
+        """The system or service responsible for managing the lifecycle of this relationship.
+        This helps identify which component has authority over the relationship."""
+        return self._inner_dict.get('lifecycleOwner')  # type: ignore
+
+    @lifecycleOwner.setter
+    def lifecycleOwner(self, value: Union[None, str]) -> None:
+        self._inner_dict['lifecycleOwner'] = value
+
+
+    @property
+    def via(self) -> Union[None, str]:
+        """Information about how or through what means this relationship was established.
+        Could indicate a specific pipeline, process, or tool that discovered/created the relationship."""
+        return self._inner_dict.get('via')  # type: ignore
+
+    @via.setter
+    def via(self, value: Union[None, str]) -> None:
+        self._inner_dict['via'] = value
+
+
+    @property
+    def properties(self) -> Union[None, Dict[str, str]]:
+        """Additional custom properties associated with this relationship.
+        Allows for flexible extension without changing the schema."""
+        return self._inner_dict.get('properties')  # type: ignore
+
+    @properties.setter
+    def properties(self, value: Union[None, Dict[str, str]]) -> None:
+        self._inner_dict['properties'] = value
+
+
+    @property
+    def auditStamp(self) -> "AuditStampClass":
+        """Stores information about who made this change and when.
+        Contains the actor (user or system) that performed the action and the timestamp."""
+        return self._inner_dict.get('auditStamp')  # type: ignore
+
+    @auditStamp.setter
+    def auditStamp(self, value: "AuditStampClass") -> None:
+        self._inner_dict['auditStamp'] = value
+
+
+class RelationshipChangeOperationClass(object):
+    # No docs available.
+
+    ADD = "ADD"
+    REMOVE = "REMOVE"
+    RESTATE = "RESTATE"
+
+
 class PlatformResourceInfoClass(_Aspect):
     """Platform Resource Info.
     These entities are for miscelaneous data that is used in non-core parts of the system.
```
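For illustration, constructing the new event through the public `datahub.metadata.schema_classes` re-export (assumed to mirror `_internal_schema_classes`, as the generated modules usually do); the URNs are placeholders:

```python
import time

from datahub.metadata.schema_classes import (
    AuditStampClass,
    RelationshipChangeEventClass,
    RelationshipChangeOperationClass,
)

event = RelationshipChangeEventClass(
    sourceUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.upstream,PROD)",
    destinationUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.downstream,PROD)",
    operation=RelationshipChangeOperationClass.ADD,
    relationshipType="DownstreamOf",
    auditStamp=AuditStampClass(
        time=int(time.time() * 1000),  # epoch millis
        actor="urn:li:corpuser:ingestion",
    ),
)
print(event.operation, event.relationshipType)
```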
datahub/metadata/_internal_schema_classes.py (continued)

```diff
@@ -25875,6 +26022,7 @@ class StructuredPropertySettingsClass(_Aspect):
         isHidden: Optional[bool]=None,
         showInSearchFilters: Optional[bool]=None,
         showInAssetSummary: Optional[bool]=None,
+        hideInAssetSummaryWhenEmpty: Optional[bool]=None,
         showAsAssetBadge: Optional[bool]=None,
         showInColumnsTable: Optional[bool]=None,
         lastModified: Union[None, "AuditStampClass"]=None,
@@ -25896,6 +26044,11 @@ class StructuredPropertySettingsClass(_Aspect):
             self.showInAssetSummary = self.RECORD_SCHEMA.fields_dict["showInAssetSummary"].default
         else:
             self.showInAssetSummary = showInAssetSummary
+        if hideInAssetSummaryWhenEmpty is None:
+            # default: False
+            self.hideInAssetSummaryWhenEmpty = self.RECORD_SCHEMA.fields_dict["hideInAssetSummaryWhenEmpty"].default
+        else:
+            self.hideInAssetSummaryWhenEmpty = hideInAssetSummaryWhenEmpty
         if showAsAssetBadge is None:
             # default: False
             self.showAsAssetBadge = self.RECORD_SCHEMA.fields_dict["showAsAssetBadge"].default
@@ -25912,6 +26065,7 @@ class StructuredPropertySettingsClass(_Aspect):
         self.isHidden = self.RECORD_SCHEMA.fields_dict["isHidden"].default
         self.showInSearchFilters = self.RECORD_SCHEMA.fields_dict["showInSearchFilters"].default
         self.showInAssetSummary = self.RECORD_SCHEMA.fields_dict["showInAssetSummary"].default
+        self.hideInAssetSummaryWhenEmpty = self.RECORD_SCHEMA.fields_dict["hideInAssetSummaryWhenEmpty"].default
         self.showAsAssetBadge = self.RECORD_SCHEMA.fields_dict["showAsAssetBadge"].default
         self.showInColumnsTable = self.RECORD_SCHEMA.fields_dict["showInColumnsTable"].default
         self.lastModified = self.RECORD_SCHEMA.fields_dict["lastModified"].default
@@ -25947,6 +26101,17 @@ class StructuredPropertySettingsClass(_Aspect):
         self._inner_dict['showInAssetSummary'] = value
 
 
+    @property
+    def hideInAssetSummaryWhenEmpty(self) -> bool:
+        """Whether or not this asset should be hidden in the asset sidebar (showInAssetSummary should be enabled)
+        when its value is empty"""
+        return self._inner_dict.get('hideInAssetSummaryWhenEmpty')  # type: ignore
+
+    @hideInAssetSummaryWhenEmpty.setter
+    def hideInAssetSummaryWhenEmpty(self, value: bool) -> None:
+        self._inner_dict['hideInAssetSummaryWhenEmpty'] = value
+
+
     @property
     def showAsAssetBadge(self) -> bool:
         """Whether or not this asset should be displayed as an asset badge on other
```
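A quick sketch of the new flag in use (assuming `StructuredPropertySettingsClass` is re-exported from `datahub.metadata.schema_classes`); all constructor arguments are optional and the new field defaults to False:

```python
from datahub.metadata.schema_classes import StructuredPropertySettingsClass

settings = StructuredPropertySettingsClass(
    showInAssetSummary=True,
    # New in 1.3.0: hide the property in the asset sidebar when it has no value.
    hideInAssetSummaryWhenEmpty=True,
)
print(settings.hideInAssetSummaryWhenEmpty)  # True
```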
datahub/metadata/_internal_schema_classes.py (continued)

```diff
@@ -27759,6 +27924,8 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.persona.DataHubPersonaInfo': DataHubPersonaInfoClass,
     'com.linkedin.pegasus2avro.platform.event.v1.EntityChangeEvent': EntityChangeEventClass,
     'com.linkedin.pegasus2avro.platform.event.v1.Parameters': ParametersClass,
+    'com.linkedin.pegasus2avro.platform.event.v1.RelationshipChangeEvent': RelationshipChangeEventClass,
+    'com.linkedin.pegasus2avro.platform.event.v1.RelationshipChangeOperation': RelationshipChangeOperationClass,
     'com.linkedin.pegasus2avro.platformresource.PlatformResourceInfo': PlatformResourceInfoClass,
     'com.linkedin.pegasus2avro.platformresource.PlatformResourceKey': PlatformResourceKeyClass,
     'com.linkedin.pegasus2avro.policy.DataHubActorFilter': DataHubActorFilterClass,
@@ -28279,6 +28446,8 @@ __SCHEMA_TYPES = {
     'DataHubPersonaInfo': DataHubPersonaInfoClass,
     'EntityChangeEvent': EntityChangeEventClass,
     'Parameters': ParametersClass,
+    'RelationshipChangeEvent': RelationshipChangeEventClass,
+    'RelationshipChangeOperation': RelationshipChangeOperationClass,
     'PlatformResourceInfo': PlatformResourceInfoClass,
     'PlatformResourceKey': PlatformResourceKeyClass,
     'DataHubActorFilter': DataHubActorFilterClass,
```
datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py

```diff
@@ -9,9 +9,13 @@
 # isort: skip_file
 from .......schema_classes import EntityChangeEventClass
 from .......schema_classes import ParametersClass
+from .......schema_classes import RelationshipChangeEventClass
+from .......schema_classes import RelationshipChangeOperationClass
 
 
 EntityChangeEvent = EntityChangeEventClass
 Parameters = ParametersClass
+RelationshipChangeEvent = RelationshipChangeEventClass
+RelationshipChangeOperation = RelationshipChangeOperationClass
 
 # fmt: on
```
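The re-exports above make the new classes importable from the pegasus2avro alias path as well; a minimal check:

```python
from datahub.metadata.com.linkedin.pegasus2avro.platform.event.v1 import (
    RelationshipChangeEvent,
    RelationshipChangeOperation,
)

# The aliases point at the generated classes re-exported above.
assert RelationshipChangeEvent.__name__ == "RelationshipChangeEventClass"
assert RelationshipChangeOperation.ADD == "ADD"
```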