acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +3 -5
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +3 -3
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +6 -12
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +7 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +251 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +29 -5
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/schema_inference/object.py

@@ -149,7 +149,7 @@ def construct_schema(

     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}

-    for field_path in schema
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"

datahub/ingestion/source/sigma/sigma.py

@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(
+            raise ConfigurationError("Unable to connect sigma API") from e

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/slack/slack.py

@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None


-class SlackSourceConfig(
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):


 @dataclass
-class SlackSourceReport(
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0

@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
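Because SlackSourceConfig now extends StatefulIngestionConfigBase, a stateful_ingestion block can sit alongside the existing Slack options. The sketch below is illustrative only and is not part of the diff; the token value is a placeholder and the standard pydantic parse_obj entry point on ConfigModel is assumed.

# Illustrative sketch (not part of the diff); values are placeholders.
from datahub.ingestion.source.slack.slack import SlackSourceConfig

config = SlackSourceConfig.parse_obj(
    {
        "bot_token": "xoxb-placeholder",
        "ingest_public_channels": True,
        "stateful_ingestion": {"enabled": True},  # enables stale entity removal
    }
)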
datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):

     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             res["session_id"],
             res["query_start_time"],
             object_modified_by_ddl,
+            res["query_type"],
         )
         if known_ddl_entry:
             return known_ddl_entry
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         session_id: str,
         timestamp: datetime,
         object_modified_by_ddl: dict,
+        query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
         timestamp = timestamp.astimezone(timezone.utc)
-        if
-            "operationType"
-
-
+        if (
+            object_modified_by_ddl["operationType"] == "ALTER"
+            and query_type == "RENAME_TABLE"
+            and object_modified_by_ddl["properties"].get("objectName")
+        ):
+            original_un = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )

-
+            new_urn = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["
+                    object_modified_by_ddl["properties"]["objectName"]["value"]
                 )
             )
-
-            return TableSwap(urn1, urn2, query, session_id, timestamp)
+            return TableRename(original_un, new_urn, query, session_id, timestamp)
         elif object_modified_by_ddl[
             "operationType"
-        ] == "
-
+        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
+            urn1 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )

-
+            urn2 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["
+                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
                 )
             )

-            return
+            return TableSwap(urn1, urn2, query, session_id, timestamp)
         else:
             self.report.num_ddl_queries_dropped += 1
             return None
@@ -731,6 +734,9 @@ fingerprinted_queries as (
     JOIN filtered_access_history a USING (query_id)
 )
 SELECT * FROM query_access_history
+-- Our query aggregator expects the queries to be added in chronological order.
+-- It's easier for us to push down the sorting to Snowflake/SQL instead of doing it in Python.
+ORDER BY QUERY_START_TIME ASC
 """


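For reference, a sketch of the object_modified_by_ddl shapes the two branches above expect, inferred from the hunk itself; the table names are placeholders.

# Inferred from the hunk above; names are placeholders.
rename_ddl = {
    "operationType": "ALTER",
    "objectName": "DB.SCHEMA.OLD_TABLE",
    "properties": {"objectName": {"value": "DB.SCHEMA.NEW_TABLE"}},
}  # together with query_type == "RENAME_TABLE" -> TableRename

swap_ddl = {
    "operationType": "ALTER",
    "objectName": "DB.SCHEMA.TABLE_A",
    "properties": {"swapTargetName": {"value": "DB.SCHEMA.TABLE_B"}},
}  # -> TableSwap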
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -134,10 +134,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     WHERE table_schema != 'INFORMATION_SCHEMA'
-    and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE'
+    and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     where table_schema='{schema_name}'
-    and table_type in ('BASE TABLE', 'EXTERNAL TABLE'
+    and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False

     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
datahub/ingestion/source/sql/athena.py

@@ -55,7 +55,7 @@ try:
 except ImportError:
     _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])

-    def override(f: _F, /) -> _F:
+    def override(f: _F, /) -> _F:
         return f


@@ -104,7 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])

     @typing.no_type_check
-    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:
         """Derives the data type of the Athena column.

         This method is overwritten to extend the behavior of PyAthena.
@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )

-        location: Optional[str] = custom_properties.get("location"
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,21 +538,15 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment"
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
-                is_part_of_key=(
-
-
-
-                    and isinstance(pk_constraints, dict)
-                    and column["name"] in pk_constraints.get("constrained_columns", [])
-                )
-                else False
+                is_part_of_key=bool(
+                    pk_constraints is not None
+                    and isinstance(pk_constraints, dict)
+                    and column["name"] in pk_constraints.get("constrained_columns", [])
                 ),
-                is_partitioning_key=(
-
-                    if (partition_keys is not None and column["name"] in partition_keys)
-                    else False
+                is_partitioning_key=bool(
+                    partition_keys is not None and column["name"] in partition_keys
                 ),
             )

datahub/ingestion/source/sql/druid.py

@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """

     def get_identifier(self, schema: str, table: str) -> str:
-        return
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"


 @platform_name("Druid")
datahub/ingestion/source/sql/hive.py

@@ -777,6 +777,7 @@ class HiveSource(TwoTierSQLAlchemySource):
             column,
             inspector,
             pk_constraints,
+            partition_keys=partition_keys,
         )

         if self._COMPLEX_TYPE.match(fields[0].nativeDataType) and isinstance(
@@ -821,12 +822,8 @@ class HiveSource(TwoTierSQLAlchemySource):

         try:
             view_definition = inspector.get_view_definition(view, schema)
-
-
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            view_definition = str(view_definition) if view_definition else ""
         except NotImplementedError:
             view_definition = ""

@@ -853,3 +850,15 @@ class HiveSource(TwoTierSQLAlchemySource):
             default_db=default_db,
             default_schema=default_schema,
         )
+
+    def get_partitions(
+        self, inspector: Inspector, schema: str, table: str
+    ) -> Optional[List[str]]:
+        partition_columns: List[dict] = inspector.get_indexes(
+            table_name=table, schema=schema
+        )
+        for partition_column in partition_columns:
+            if partition_column.get("column_names"):
+                return partition_column.get("column_names")
+
+        return []
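The new get_partitions override reads the index metadata that SQLAlchemy's Inspector.get_indexes returns (a list of dicts with a "column_names" key). A minimal sketch of the same selection logic, with placeholder index entries rather than real Hive metadata:

# Sketch only: pick the first index entry that carries column names.
indexes = [
    {"name": "partition", "column_names": ["ds", "region"], "unique": False},
]
partition_keys = next(
    (idx["column_names"] for idx in indexes if idx.get("column_names")), []
)
# partition_keys == ["ds", "region"]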
datahub/ingestion/source/sql/hive_metastore.py

@@ -67,7 +67,7 @@ TableKey = namedtuple("TableKey", ["schema", "table"])


 class HiveMetastoreConfigMode(StrEnum):
-    hive: str = "hive"
+    hive: str = "hive"
     presto: str = "presto"
     presto_on_hive: str = "presto-on-hive"
     trino: str = "trino"
@@ -893,8 +893,9 @@ class HiveMetastoreSource(SQLAlchemySource):
         return get_schema_fields_for_hive_column(
             column["col_name"],
             column["col_type"],
+            # column is actually an sqlalchemy.engine.row.LegacyRow, not a Dict and we cannot make column.get("col_description", "")
             description=(
-                column["col_description"] if "col_description" in column else ""
+                column["col_description"] if "col_description" in column else ""  # noqa: SIM401
             ),
             default_nullable=True,
         )
datahub/ingestion/source/sql/mssql/job_models.py

@@ -11,12 +11,17 @@ from datahub.emitter.mcp_builder import (
     DatabaseKey,
     SchemaKey,
 )
+from datahub.ingestion.source.common.subtypes import (
+    FlowContainerSubTypes,
+    JobContainerSubTypes,
+)
 from datahub.metadata.schema_classes import (
     ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
     DataPlatformInstanceClass,
+    SubTypesClass,
 )


@@ -211,6 +216,18 @@ class MSSQLDataJob:
             status=self.status,
         )

+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (JobStep, StoredProcedure))
+        type = (
+            JobContainerSubTypes.MSSQL_JOBSTEP
+            if isinstance(self.entity, JobStep)
+            else JobContainerSubTypes.MSSQL_STORED_PROCEDURE
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.flow.platform_instance:
@@ -276,6 +293,18 @@ class MSSQLDataFlow:
             externalUrl=self.external_url,
         )

+    @property
+    def as_subtypes_aspect(self) -> SubTypesClass:
+        assert isinstance(self.entity, (MSSQLJob, MSSQLProceduresContainer))
+        type = (
+            FlowContainerSubTypes.MSSQL_JOB
+            if isinstance(self.entity, MSSQLJob)
+            else FlowContainerSubTypes.MSSQL_PROCEDURE_CONTAINER
+        )
+        return SubTypesClass(
+            typeNames=[type],
+        )
+
     @property
     def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
         if self.entity.platform_instance:
datahub/ingestion/source/sql/mssql/source.py

@@ -401,7 +401,7 @@ class SQLServerSource(SQLAlchemySource):
             data_job.add_property(name=data_name, value=str(data_value))
         yield from self.construct_job_workunits(data_job)

-    def loop_stored_procedures(
+    def loop_stored_procedures(
         self,
         inspector: Inspector,
         schema: str,
@@ -638,6 +638,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()

+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_job.urn,
+            aspect=data_job.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -676,8 +681,6 @@ class SQLServerSource(SQLAlchemySource):
             ),
         ).as_workunit()

-        # TODO: Add SubType when it appear
-
     def construct_flow_workunits(
         self,
         data_flow: MSSQLDataFlow,
@@ -687,6 +690,11 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_dataflow_info_aspect,
         ).as_workunit()

+        yield MetadataChangeProposalWrapper(
+            entityUrn=data_flow.urn,
+            aspect=data_flow.as_subtypes_aspect,
+        ).as_workunit()
+
         data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
         if data_platform_instance_aspect:
             yield MetadataChangeProposalWrapper(
@@ -700,8 +708,6 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_flow.as_container_aspect,
         ).as_workunit()

-        # TODO: Add SubType when it appear
-
     def get_inspectors(self) -> Iterable[Inspector]:
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.