acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Dict, List, Optional, Sequence, Type, Union
|
|
5
|
+
|
|
6
|
+
from typing_extensions import Self
|
|
7
|
+
|
|
8
|
+
from datahub.emitter.mce_builder import DEFAULT_ENV
|
|
9
|
+
from datahub.metadata.schema_classes import (
|
|
10
|
+
AspectBag,
|
|
11
|
+
MLModelGroupPropertiesClass,
|
|
12
|
+
)
|
|
13
|
+
from datahub.metadata.urns import DataProcessInstanceUrn, MlModelGroupUrn, Urn
|
|
14
|
+
from datahub.sdk._shared import (
|
|
15
|
+
DomainInputType,
|
|
16
|
+
HasDomain,
|
|
17
|
+
HasInstitutionalMemory,
|
|
18
|
+
HasOwnership,
|
|
19
|
+
HasPlatformInstance,
|
|
20
|
+
HasTags,
|
|
21
|
+
HasTerms,
|
|
22
|
+
LinksInputType,
|
|
23
|
+
OwnersInputType,
|
|
24
|
+
TagsInputType,
|
|
25
|
+
TermsInputType,
|
|
26
|
+
make_time_stamp,
|
|
27
|
+
parse_time_stamp,
|
|
28
|
+
)
|
|
29
|
+
from datahub.sdk.entity import Entity, ExtraAspectsType
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MLModelGroup(
    HasPlatformInstance,
    HasOwnership,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    Entity,
):
    """SDK entity wrapper for an ML model group.

    Combines the standard aspect mixins (platform instance, ownership,
    institutional memory/links, tags, terms, domain) with accessors for the
    ``MLModelGroupProperties`` aspect (name, description, external URL,
    custom properties, created/last-modified timestamps, and the training
    and downstream job urn lists).
    """

    # No instance attributes beyond those defined by Entity and the mixins.
    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[MlModelGroupUrn]:
        """Return the urn class used to identify this entity type."""
        return MlModelGroupUrn

    def __init__(
        self,
        id: str,
        platform: str,
        # NOTE(review): default is "" rather than None; a falsy name falls
        # back to the urn name via _ensure_model_group_props — confirm intended.
        name: Optional[str] = "",
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        # Model group properties
        description: Optional[str] = None,
        display_name: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        training_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
        downstream_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Create an ML model group entity.

        Args:
            id: Identifier used as the urn name.
            platform: Data platform the model group belongs to.
            name: Display name; ``display_name`` takes precedence when both
                are given.
            platform_instance: Optional platform instance qualifier.
            env: Fabric/environment for the urn (defaults to ``DEFAULT_ENV``).
            description: Optional free-text description.
            display_name: Preferred display name (overrides ``name``).
            external_url: Optional link to an external system.
            custom_properties: Arbitrary string key/value properties.
            created: Creation timestamp for the group.
            last_modified: Last-modification timestamp for the group.
            owners / links / tags / terms / domain: Standard aspect inputs,
                applied via the corresponding mixin setters.
            training_jobs: Urns of jobs that trained models in this group.
            downstream_jobs: Urns of jobs downstream of this group.
            extra_aspects: Additional pre-built aspects to attach.
        """
        urn = MlModelGroupUrn(platform=platform, name=id, env=env)
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)

        self._set_platform_instance(urn.platform, platform_instance)

        # Set MLModelGroupProperties aspect; display_name wins over name.
        self._ensure_model_group_props(name=display_name or name)

        if description is not None:
            self.set_description(description)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)

        # Standard aspects
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)

        # ML model group specific aspects
        if training_jobs is not None:
            self.set_training_jobs(training_jobs)
        if downstream_jobs is not None:
            self.set_downstream_jobs(downstream_jobs)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: AspectBag) -> Self:
        """Reconstruct an entity from aspects fetched from the graph.

        The minimal constructor call only sets the urn parts; all other
        state comes from ``current_aspects`` via ``_init_from_graph``.
        """
        assert isinstance(urn, MlModelGroupUrn)
        entity = cls(
            platform=urn.platform,
            id=urn.name,
            env=urn.env,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> MlModelGroupUrn:
        """The typed urn for this model group."""
        return self._urn  # type: ignore

    def _ensure_model_group_props(
        self, *, name: Optional[str] = None
    ) -> MLModelGroupPropertiesClass:
        """Return the properties aspect, creating it if absent.

        When ``name`` is provided, a default aspect with that name is
        installed if none exists. Otherwise an existing aspect is returned,
        falling back to a new aspect named after the urn.
        """
        if name is not None:
            return self._setdefault_aspect(MLModelGroupPropertiesClass(name=name))

        props = self._get_aspect(MLModelGroupPropertiesClass)
        if props is None:
            # If we need properties but they don't exist and no name was provided
            return self._setdefault_aspect(
                MLModelGroupPropertiesClass(name=self.urn.name)
            )
        return props

    @property
    def name(self) -> Optional[str]:
        """Display name stored in the properties aspect."""
        return self._ensure_model_group_props().name

    def set_name(self, display_name: str) -> None:
        """Set the display name."""
        self._ensure_model_group_props().name = display_name

    @property
    def description(self) -> Optional[str]:
        """Free-text description, if set."""
        return self._ensure_model_group_props().description

    def set_description(self, description: str) -> None:
        """Set the description."""
        self._ensure_model_group_props().description = description

    @property
    def external_url(self) -> Optional[str]:
        """External URL, if set."""
        return self._ensure_model_group_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL."""
        self._ensure_model_group_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Optional[Dict[str, str]]:
        """Custom string key/value properties."""
        return self._ensure_model_group_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Replace the custom properties mapping."""
        self._ensure_model_group_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Creation timestamp, converted from the aspect's TimeStamp."""
        return parse_time_stamp(self._ensure_model_group_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp."""
        self._ensure_model_group_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Last-modified timestamp, converted from the aspect's TimeStamp."""
        return parse_time_stamp(self._ensure_model_group_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last-modified timestamp."""
        self._ensure_model_group_props().lastModified = make_time_stamp(last_modified)

    @property
    def training_jobs(self) -> Optional[List[str]]:
        """Urns of training jobs associated with this group."""
        return self._ensure_model_group_props().trainingJobs

    def set_training_jobs(
        self, training_jobs: Sequence[Union[str, DataProcessInstanceUrn]]
    ) -> None:
        """Replace the training job list (urns are stringified)."""
        self._ensure_model_group_props().trainingJobs = [
            str(job) for job in training_jobs
        ]

    def add_training_job(
        self, training_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Append a training job urn, initializing the list if needed."""
        props = self._ensure_model_group_props()
        if props.trainingJobs is None:
            props.trainingJobs = []
        props.trainingJobs.append(str(training_job))

    def remove_training_job(
        self, training_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Remove all occurrences of the given training job urn."""
        props = self._ensure_model_group_props()
        if props.trainingJobs is not None:
            props.trainingJobs = [
                job for job in props.trainingJobs if job != str(training_job)
            ]

    @property
    def downstream_jobs(self) -> Optional[List[str]]:
        """Urns of downstream jobs associated with this group."""
        return self._ensure_model_group_props().downstreamJobs

    def set_downstream_jobs(
        self, downstream_jobs: Sequence[Union[str, DataProcessInstanceUrn]]
    ) -> None:
        """Replace the downstream job list (urns are stringified)."""
        self._ensure_model_group_props().downstreamJobs = [
            str(job) for job in downstream_jobs
        ]

    def add_downstream_job(
        self, downstream_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Append a downstream job urn, initializing the list if needed."""
        props = self._ensure_model_group_props()
        if props.downstreamJobs is None:
            props.downstreamJobs = []
        props.downstreamJobs.append(str(downstream_job))

    def remove_downstream_job(
        self, downstream_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Remove all occurrences of the given downstream job urn."""
        props = self._ensure_model_group_props()
        if props.downstreamJobs is not None:
            props.downstreamJobs = [
                job for job in props.downstreamJobs if job != str(downstream_job)
            ]
|
|
@@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
from pydantic import BaseModel, validator
|
|
5
5
|
|
|
6
|
-
from datahub.ingestion.graph.client import
|
|
6
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
7
|
+
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
7
8
|
from datahub.secret.datahub_secrets_client import DataHubSecretsClient
|
|
8
9
|
from datahub.secret.secret_store import SecretStore
|
|
9
10
|
|
datahub/specific/dataset.py
CHANGED
|
@@ -292,3 +292,15 @@ class DatasetPatchBuilder(
|
|
|
292
292
|
value=timestamp,
|
|
293
293
|
)
|
|
294
294
|
return self
|
|
295
|
+
|
|
296
|
+
def set_external_url(
|
|
297
|
+
self, external_url: Optional[str] = None
|
|
298
|
+
) -> "DatasetPatchBuilder":
|
|
299
|
+
if external_url is not None:
|
|
300
|
+
self._add_patch(
|
|
301
|
+
DatasetProperties.ASPECT_NAME,
|
|
302
|
+
"add",
|
|
303
|
+
path=("externalUrl",),
|
|
304
|
+
value=external_url,
|
|
305
|
+
)
|
|
306
|
+
return self
|
|
@@ -32,6 +32,7 @@ from datahub.metadata.urns import (
|
|
|
32
32
|
SchemaFieldUrn,
|
|
33
33
|
Urn,
|
|
34
34
|
)
|
|
35
|
+
from datahub.sql_parsing.fingerprint_utils import generate_hash
|
|
35
36
|
from datahub.sql_parsing.schema_resolver import (
|
|
36
37
|
SchemaResolver,
|
|
37
38
|
SchemaResolverInterface,
|
|
@@ -49,7 +50,6 @@ from datahub.sql_parsing.sqlglot_lineage import (
|
|
|
49
50
|
)
|
|
50
51
|
from datahub.sql_parsing.sqlglot_utils import (
|
|
51
52
|
_parse_statement,
|
|
52
|
-
generate_hash,
|
|
53
53
|
get_query_fingerprint,
|
|
54
54
|
try_format_query,
|
|
55
55
|
)
|
|
@@ -155,6 +155,47 @@ class QueryMetadata:
|
|
|
155
155
|
actor=(self.actor or _DEFAULT_USER_URN).urn(),
|
|
156
156
|
)
|
|
157
157
|
|
|
158
|
+
    def get_subjects(
        self,
        downstream_urn: Optional[str],
        include_fields: bool,
    ) -> List[UrnStr]:
        """Collect the query-subject urns for this query.

        Includes every upstream urn, optionally each upstream schema-field
        urn referenced in ``column_usage``, then the downstream urn (when
        given) and optionally its schema-field urns from ``column_lineage``.
        Insertion order is preserved and duplicates removed via OrderedSet.

        Args:
            downstream_urn: Urn of the query's downstream dataset, if any.
            include_fields: Whether to also emit schema-field-level urns.

        Returns:
            Deduplicated list of subject urns in discovery order.
        """
        query_subject_urns = OrderedSet[UrnStr]()
        for upstream in self.upstreams:
            query_subject_urns.add(upstream)
            if include_fields:
                # Sorted for deterministic output across runs.
                for column in sorted(self.column_usage.get(upstream, [])):
                    query_subject_urns.add(
                        builder.make_schema_field_urn(upstream, column)
                    )
        if downstream_urn:
            query_subject_urns.add(downstream_urn)
            if include_fields:
                for column_lineage in self.column_lineage:
                    query_subject_urns.add(
                        builder.make_schema_field_urn(
                            downstream_urn, column_lineage.downstream.column
                        )
                    )
        return list(query_subject_urns)
|
|
181
|
+
|
|
182
|
+
    def make_query_properties(self) -> models.QueryPropertiesClass:
        """Build the QueryProperties aspect for this query.

        The statement uses the formatted query string with SQL as the
        language, marks the source as SYSTEM, and stamps created /
        lastModified from this metadata's audit-stamp helpers.
        """
        return models.QueryPropertiesClass(
            statement=models.QueryStatementClass(
                value=self.formatted_query_string,
                language=models.QueryLanguageClass.SQL,
            ),
            source=models.QuerySourceClass.SYSTEM,
            created=self.make_created_audit_stamp(),
            lastModified=self.make_last_modified_audit_stamp(),
        )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
    """Wrap entity urns into a QuerySubjects aspect, preserving order."""
    subjects = []
    for urn in urns:
        subjects.append(models.QuerySubjectClass(entity=urn))
    return models.QuerySubjectsClass(subjects=subjects)
|
|
198
|
+
|
|
158
199
|
|
|
159
200
|
@dataclasses.dataclass
|
|
160
201
|
class KnownQueryLineageInfo:
|
|
@@ -1440,42 +1481,15 @@ class SqlParsingAggregator(Closeable):
|
|
|
1440
1481
|
self.report.num_queries_skipped_due_to_filters += 1
|
|
1441
1482
|
return
|
|
1442
1483
|
|
|
1443
|
-
query_subject_urns = OrderedSet[UrnStr]()
|
|
1444
|
-
for upstream in query.upstreams:
|
|
1445
|
-
query_subject_urns.add(upstream)
|
|
1446
|
-
if self.generate_query_subject_fields:
|
|
1447
|
-
for column in sorted(query.column_usage.get(upstream, [])):
|
|
1448
|
-
query_subject_urns.add(
|
|
1449
|
-
builder.make_schema_field_urn(upstream, column)
|
|
1450
|
-
)
|
|
1451
|
-
if downstream_urn:
|
|
1452
|
-
query_subject_urns.add(downstream_urn)
|
|
1453
|
-
if self.generate_query_subject_fields:
|
|
1454
|
-
for column_lineage in query.column_lineage:
|
|
1455
|
-
query_subject_urns.add(
|
|
1456
|
-
builder.make_schema_field_urn(
|
|
1457
|
-
downstream_urn, column_lineage.downstream.column
|
|
1458
|
-
)
|
|
1459
|
-
)
|
|
1460
|
-
|
|
1461
1484
|
yield from MetadataChangeProposalWrapper.construct_many(
|
|
1462
1485
|
entityUrn=self._query_urn(query_id),
|
|
1463
1486
|
aspects=[
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
created=query.make_created_audit_stamp(),
|
|
1471
|
-
lastModified=query.make_last_modified_audit_stamp(),
|
|
1472
|
-
origin=query.origin.urn() if query.origin else None,
|
|
1473
|
-
),
|
|
1474
|
-
models.QuerySubjectsClass(
|
|
1475
|
-
subjects=[
|
|
1476
|
-
models.QuerySubjectClass(entity=urn)
|
|
1477
|
-
for urn in query_subject_urns
|
|
1478
|
-
]
|
|
1487
|
+
query.make_query_properties(),
|
|
1488
|
+
make_query_subjects(
|
|
1489
|
+
query.get_subjects(
|
|
1490
|
+
downstream_urn=downstream_urn,
|
|
1491
|
+
include_fields=self.generate_query_subject_fields,
|
|
1492
|
+
)
|
|
1479
1493
|
),
|
|
1480
1494
|
models.DataPlatformInstanceClass(
|
|
1481
1495
|
platform=self.platform.urn(),
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
|
|
2
2
|
|
|
3
3
|
import functools
|
|
4
|
-
import hashlib
|
|
5
4
|
import logging
|
|
6
5
|
import re
|
|
7
6
|
from typing import Dict, Iterable, Optional, Tuple, Union
|
|
@@ -10,6 +9,8 @@ import sqlglot
|
|
|
10
9
|
import sqlglot.errors
|
|
11
10
|
import sqlglot.optimizer.eliminate_ctes
|
|
12
11
|
|
|
12
|
+
from datahub.sql_parsing.fingerprint_utils import generate_hash
|
|
13
|
+
|
|
13
14
|
assert SQLGLOT_PATCHED
|
|
14
15
|
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
@@ -251,13 +252,11 @@ def generalize_query(expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr) ->
|
|
|
251
252
|
return expression.transform(_strip_expression, copy=True).sql(dialect=dialect)
|
|
252
253
|
|
|
253
254
|
|
|
254
|
-
def generate_hash(text: str) -> str:
|
|
255
|
-
# Once we move to Python 3.9+, we can set `usedforsecurity=False`.
|
|
256
|
-
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
257
|
-
|
|
258
|
-
|
|
259
255
|
def get_query_fingerprint_debug(
|
|
260
|
-
expression: sqlglot.exp.ExpOrStr,
|
|
256
|
+
expression: sqlglot.exp.ExpOrStr,
|
|
257
|
+
platform: DialectOrStr,
|
|
258
|
+
fast: bool = False,
|
|
259
|
+
secondary_id: Optional[str] = None,
|
|
261
260
|
) -> Tuple[str, Optional[str]]:
|
|
262
261
|
try:
|
|
263
262
|
if not fast:
|
|
@@ -272,16 +271,18 @@ def get_query_fingerprint_debug(
|
|
|
272
271
|
logger.debug("Failed to generalize query for fingerprinting: %s", e)
|
|
273
272
|
expression_sql = None
|
|
274
273
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
)
|
|
274
|
+
text = expression_sql or _expression_to_string(expression, platform=platform)
|
|
275
|
+
if secondary_id:
|
|
276
|
+
text = text + " -- " + secondary_id
|
|
277
|
+
fingerprint = generate_hash(text=text)
|
|
280
278
|
return fingerprint, expression_sql
|
|
281
279
|
|
|
282
280
|
|
|
283
281
|
def get_query_fingerprint(
|
|
284
|
-
expression: sqlglot.exp.ExpOrStr,
|
|
282
|
+
expression: sqlglot.exp.ExpOrStr,
|
|
283
|
+
platform: DialectOrStr,
|
|
284
|
+
fast: bool = False,
|
|
285
|
+
secondary_id: Optional[str] = None,
|
|
285
286
|
) -> str:
|
|
286
287
|
"""Get a fingerprint for a SQL query.
|
|
287
288
|
|
|
@@ -298,12 +299,15 @@ def get_query_fingerprint(
|
|
|
298
299
|
Args:
|
|
299
300
|
expression: The SQL query to fingerprint.
|
|
300
301
|
platform: The SQL dialect to use.
|
|
302
|
+
secondary_id: An optional additional id string to included in the final fingerprint.
|
|
301
303
|
|
|
302
304
|
Returns:
|
|
303
305
|
The fingerprint for the SQL query.
|
|
304
306
|
"""
|
|
305
307
|
|
|
306
|
-
return get_query_fingerprint_debug(
|
|
308
|
+
return get_query_fingerprint_debug(
|
|
309
|
+
expression=expression, platform=platform, fast=fast, secondary_id=secondary_id
|
|
310
|
+
)[0]
|
|
307
311
|
|
|
308
312
|
|
|
309
313
|
@functools.lru_cache(maxsize=FORMAT_QUERY_CACHE_SIZE)
|
datahub/telemetry/telemetry.py
CHANGED
|
@@ -352,10 +352,10 @@ class Telemetry:
|
|
|
352
352
|
}
|
|
353
353
|
else:
|
|
354
354
|
return {
|
|
355
|
-
"server_type": server.server_config.get("datahub", {}).get(
|
|
355
|
+
"server_type": server.server_config.raw_config.get("datahub", {}).get(
|
|
356
356
|
"serverType", "missing"
|
|
357
357
|
),
|
|
358
|
-
"server_version": server.server_config.get("versions", {})
|
|
358
|
+
"server_version": server.server_config.raw_config.get("versions", {})
|
|
359
359
|
.get("acryldata/datahub", {})
|
|
360
360
|
.get("version", "missing"),
|
|
361
361
|
"server_id": server.server_id or "missing",
|
datahub/testing/check_imports.py
CHANGED
|
@@ -9,7 +9,7 @@ def ensure_no_indirect_model_imports(dirs: List[pathlib.Path]) -> None:
|
|
|
9
9
|
# If our needs become more complex, we should move to a proper linter.
|
|
10
10
|
denied_imports = {
|
|
11
11
|
"src.": "datahub.*",
|
|
12
|
-
"datahub.metadata.
|
|
12
|
+
"datahub.metadata._internal_schema_classes": "datahub.metadata.schema_classes",
|
|
13
13
|
"datahub.metadata._urns": "datahub.metadata.urns",
|
|
14
14
|
}
|
|
15
15
|
ignored_files = {
|
datahub/testing/mcp_diff.py
CHANGED
|
@@ -2,7 +2,7 @@ import dataclasses
|
|
|
2
2
|
import json
|
|
3
3
|
import re
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
from typing import Any, Dict, List, Sequence, Set, Tuple, Union
|
|
5
|
+
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
|
6
6
|
|
|
7
7
|
import deepdiff.serialization
|
|
8
8
|
import yaml
|
|
@@ -34,6 +34,7 @@ class AspectForDiff:
|
|
|
34
34
|
aspect_name: str
|
|
35
35
|
aspect: Dict[str, Any] = dataclasses.field(hash=False)
|
|
36
36
|
delta_info: "DeltaInfo" = dataclasses.field(hash=False, repr=False)
|
|
37
|
+
headers: Optional[Dict[str, str]] = dataclasses.field(default=None, hash=False)
|
|
37
38
|
|
|
38
39
|
@classmethod
|
|
39
40
|
def create_from_mcp(cls, idx: int, obj: Dict[str, Any]) -> "AspectForDiff":
|
|
@@ -44,6 +45,7 @@ class AspectForDiff:
|
|
|
44
45
|
aspect_name=obj["aspectName"],
|
|
45
46
|
aspect=aspect.get("json", aspect),
|
|
46
47
|
delta_info=DeltaInfo(idx=idx, original=obj),
|
|
48
|
+
headers=obj.get("headers"),
|
|
47
49
|
)
|
|
48
50
|
|
|
49
51
|
def __repr__(self):
|
|
@@ -240,9 +242,12 @@ class MCPDiff:
|
|
|
240
242
|
s.append(serialize_aspect(ga.aspect))
|
|
241
243
|
for (i, old, new), diffs in aspect_diffs.aspects_changed.items():
|
|
242
244
|
s.append(self.report_aspect(old, i, "changed") + ":")
|
|
245
|
+
|
|
246
|
+
print_aspects = False
|
|
243
247
|
for diff_level in diffs:
|
|
244
248
|
s.append(self.report_diff_level(diff_level, i))
|
|
245
|
-
|
|
249
|
+
print_aspects |= self.is_diff_level_on_aspect(diff_level)
|
|
250
|
+
if verbose and print_aspects:
|
|
246
251
|
s.append(f"Old aspect:\n{serialize_aspect(old.aspect)}")
|
|
247
252
|
s.append(f"New aspect:\n{serialize_aspect(new.aspect)}")
|
|
248
253
|
|
|
@@ -271,6 +276,14 @@ class MCPDiff:
|
|
|
271
276
|
f"root[{idx}].", ""
|
|
272
277
|
)
|
|
273
278
|
|
|
279
|
+
    @staticmethod
    def is_diff_level_on_aspect(diff: DiffLevel) -> bool:
        """Return True when a diff touches the aspect payload itself.

        Diffs whose second path element is ``changeType`` or ``headers``
        are envelope-level changes, so printing the full old/new aspects
        adds no information. Paths too short to have a second element are
        treated as aspect-level (True).
        """
        skip_print_fields = ["changeType", "headers"]
        try:
            # path(output_format="list") yields the diff path components;
            # index 1 is the top-level MCP field under the root entry.
            return diff.path(output_format="list")[1] not in skip_print_fields
        except IndexError:
            return True
|
|
286
|
+
|
|
274
287
|
|
|
275
288
|
def serialize_aspect(aspect: Union[AspectForDiff, Dict[str, Any]]) -> str:
|
|
276
289
|
if isinstance(aspect, AspectForDiff): # Unpack aspect
|
datahub/upgrade/upgrade.py
CHANGED
|
@@ -13,7 +13,9 @@ from pydantic import BaseModel
|
|
|
13
13
|
from datahub._version import __version__
|
|
14
14
|
from datahub.cli.config_utils import load_client_config
|
|
15
15
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
16
|
+
from datahub.ingestion.graph.config import ClientMode
|
|
16
17
|
from datahub.utilities.perf_timer import PerfTimer
|
|
18
|
+
from datahub.utilities.server_config_util import RestServiceConfig
|
|
17
19
|
|
|
18
20
|
log = logging.getLogger(__name__)
|
|
19
21
|
|
|
@@ -109,7 +111,7 @@ async def get_github_stats():
|
|
|
109
111
|
return (latest_server_version, latest_server_date)
|
|
110
112
|
|
|
111
113
|
|
|
112
|
-
async def get_server_config(gms_url: str, token: Optional[str]) ->
|
|
114
|
+
async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
|
|
113
115
|
import aiohttp
|
|
114
116
|
|
|
115
117
|
headers = {
|
|
@@ -124,7 +126,7 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
|
|
|
124
126
|
config_endpoint = f"{gms_url}/config"
|
|
125
127
|
async with session.get(config_endpoint, headers=headers) as dh_response:
|
|
126
128
|
dh_response_json = await dh_response.json()
|
|
127
|
-
return dh_response_json
|
|
129
|
+
return RestServiceConfig(raw_config=dh_response_json)
|
|
128
130
|
|
|
129
131
|
|
|
130
132
|
async def get_server_version_stats(
|
|
@@ -132,11 +134,12 @@ async def get_server_version_stats(
|
|
|
132
134
|
) -> Tuple[Optional[str], Optional[Version], Optional[datetime]]:
|
|
133
135
|
import aiohttp
|
|
134
136
|
|
|
135
|
-
server_config = None
|
|
137
|
+
server_config: Optional[RestServiceConfig] = None
|
|
136
138
|
if not server:
|
|
137
139
|
try:
|
|
138
140
|
# let's get the server from the cli config
|
|
139
141
|
client_config = load_client_config()
|
|
142
|
+
client_config.client_mode = ClientMode.CLI
|
|
140
143
|
host = client_config.server
|
|
141
144
|
token = client_config.token
|
|
142
145
|
server_config = await get_server_config(host, token)
|
|
@@ -150,15 +153,10 @@ async def get_server_version_stats(
|
|
|
150
153
|
server_version: Optional[Version] = None
|
|
151
154
|
current_server_release_date = None
|
|
152
155
|
if server_config:
|
|
153
|
-
server_version_string =
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
)
|
|
158
|
-
commit_hash = (
|
|
159
|
-
server_config.get("versions", {}).get("acryldata/datahub", {}).get("commit")
|
|
160
|
-
)
|
|
161
|
-
server_type = server_config.get("datahub", {}).get("serverType", "unknown")
|
|
156
|
+
server_version_string = server_config.service_version
|
|
157
|
+
commit_hash = server_config.commit_hash
|
|
158
|
+
server_type = server_config.server_type
|
|
159
|
+
|
|
162
160
|
if server_type == "quickstart" and commit_hash:
|
|
163
161
|
async with aiohttp.ClientSession(
|
|
164
162
|
headers={"Accept": "application/vnd.github.v3+json"}
|
|
@@ -161,6 +161,7 @@ class _LogBuffer:
|
|
|
161
161
|
self._buffer: Deque[str] = collections.deque(maxlen=maxlen)
|
|
162
162
|
|
|
163
163
|
def write(self, line: str) -> None:
|
|
164
|
+
# We do not expect `line` to have a trailing newline.
|
|
164
165
|
if len(line) > IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH:
|
|
165
166
|
line = line[:IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH] + "[truncated]"
|
|
166
167
|
|
|
@@ -188,7 +189,13 @@ class _BufferLogHandler(logging.Handler):
|
|
|
188
189
|
message = self.format(record)
|
|
189
190
|
except TypeError as e:
|
|
190
191
|
message = f"Error formatting log message: {e}\nMessage: {record.msg}, Args: {record.args}"
|
|
191
|
-
|
|
192
|
+
|
|
193
|
+
# For exception stack traces, the message is split over multiple lines,
|
|
194
|
+
# but we store it as a single string. Because we truncate based on line
|
|
195
|
+
# length, it's better for us to split it into multiple lines so that we
|
|
196
|
+
# don't lose any information on deeper stack traces.
|
|
197
|
+
for line in message.split("\n"):
|
|
198
|
+
self._storage.write(line)
|
|
192
199
|
|
|
193
200
|
|
|
194
201
|
def _remove_all_handlers(logger: logging.Logger) -> None:
|