acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (159):
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/sdk/mlmodelgroup.py (new file)
@@ -0,0 +1,233 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Dict, List, Optional, Sequence, Type, Union
+
+ from typing_extensions import Self
+
+ from datahub.emitter.mce_builder import DEFAULT_ENV
+ from datahub.metadata.schema_classes import (
+     AspectBag,
+     MLModelGroupPropertiesClass,
+ )
+ from datahub.metadata.urns import DataProcessInstanceUrn, MlModelGroupUrn, Urn
+ from datahub.sdk._shared import (
+     DomainInputType,
+     HasDomain,
+     HasInstitutionalMemory,
+     HasOwnership,
+     HasPlatformInstance,
+     HasTags,
+     HasTerms,
+     LinksInputType,
+     OwnersInputType,
+     TagsInputType,
+     TermsInputType,
+     make_time_stamp,
+     parse_time_stamp,
+ )
+ from datahub.sdk.entity import Entity, ExtraAspectsType
+
+
+ class MLModelGroup(
+     HasPlatformInstance,
+     HasOwnership,
+     HasInstitutionalMemory,
+     HasTags,
+     HasTerms,
+     HasDomain,
+     Entity,
+ ):
+     __slots__ = ()
+
+     @classmethod
+     def get_urn_type(cls) -> Type[MlModelGroupUrn]:
+         return MlModelGroupUrn
+
+     def __init__(
+         self,
+         id: str,
+         platform: str,
+         name: Optional[str] = "",
+         platform_instance: Optional[str] = None,
+         env: str = DEFAULT_ENV,
+         # Model group properties
+         description: Optional[str] = None,
+         display_name: Optional[str] = None,
+         external_url: Optional[str] = None,
+         custom_properties: Optional[Dict[str, str]] = None,
+         created: Optional[datetime] = None,
+         last_modified: Optional[datetime] = None,
+         # Standard aspects
+         owners: Optional[OwnersInputType] = None,
+         links: Optional[LinksInputType] = None,
+         tags: Optional[TagsInputType] = None,
+         terms: Optional[TermsInputType] = None,
+         domain: Optional[DomainInputType] = None,
+         training_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
+         downstream_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
+         extra_aspects: ExtraAspectsType = None,
+     ):
+         urn = MlModelGroupUrn(platform=platform, name=id, env=env)
+         super().__init__(urn)
+         self._set_extra_aspects(extra_aspects)
+
+         self._set_platform_instance(urn.platform, platform_instance)
+
+         # Set MLModelGroupProperties aspect
+         self._ensure_model_group_props(name=display_name or name)
+
+         if description is not None:
+             self.set_description(description)
+         if external_url is not None:
+             self.set_external_url(external_url)
+         if custom_properties is not None:
+             self.set_custom_properties(custom_properties)
+         if created is not None:
+             self.set_created(created)
+         if last_modified is not None:
+             self.set_last_modified(last_modified)
+
+         # Standard aspects
+         if owners is not None:
+             self.set_owners(owners)
+         if links is not None:
+             self.set_links(links)
+         if tags is not None:
+             self.set_tags(tags)
+         if terms is not None:
+             self.set_terms(terms)
+         if domain is not None:
+             self.set_domain(domain)
+
+         # ML model group specific aspects
+         if training_jobs is not None:
+             self.set_training_jobs(training_jobs)
+         if downstream_jobs is not None:
+             self.set_downstream_jobs(downstream_jobs)
+
+     @classmethod
+     def _new_from_graph(cls, urn: Urn, current_aspects: AspectBag) -> Self:
+         assert isinstance(urn, MlModelGroupUrn)
+         entity = cls(
+             platform=urn.platform,
+             id=urn.name,
+             env=urn.env,
+         )
+         return entity._init_from_graph(current_aspects)
+
+     @property
+     def urn(self) -> MlModelGroupUrn:
+         return self._urn  # type: ignore
+
+     def _ensure_model_group_props(
+         self, *, name: Optional[str] = None
+     ) -> MLModelGroupPropertiesClass:
+         if name is not None:
+             return self._setdefault_aspect(MLModelGroupPropertiesClass(name=name))
+
+         props = self._get_aspect(MLModelGroupPropertiesClass)
+         if props is None:
+             # If we need properties but they don't exist and no name was provided
+             return self._setdefault_aspect(
+                 MLModelGroupPropertiesClass(name=self.urn.name)
+             )
+         return props
+
+     @property
+     def name(self) -> Optional[str]:
+         return self._ensure_model_group_props().name
+
+     def set_name(self, display_name: str) -> None:
+         self._ensure_model_group_props().name = display_name
+
+     @property
+     def description(self) -> Optional[str]:
+         return self._ensure_model_group_props().description
+
+     def set_description(self, description: str) -> None:
+         self._ensure_model_group_props().description = description
+
+     @property
+     def external_url(self) -> Optional[str]:
+         return self._ensure_model_group_props().externalUrl
+
+     def set_external_url(self, external_url: str) -> None:
+         self._ensure_model_group_props().externalUrl = external_url
+
+     @property
+     def custom_properties(self) -> Optional[Dict[str, str]]:
+         return self._ensure_model_group_props().customProperties
+
+     def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
+         self._ensure_model_group_props().customProperties = custom_properties
+
+     @property
+     def created(self) -> Optional[datetime]:
+         return parse_time_stamp(self._ensure_model_group_props().created)
+
+     def set_created(self, created: datetime) -> None:
+         self._ensure_model_group_props().created = make_time_stamp(created)
+
+     @property
+     def last_modified(self) -> Optional[datetime]:
+         return parse_time_stamp(self._ensure_model_group_props().lastModified)
+
+     def set_last_modified(self, last_modified: datetime) -> None:
+         self._ensure_model_group_props().lastModified = make_time_stamp(last_modified)
+
+     @property
+     def training_jobs(self) -> Optional[List[str]]:
+         return self._ensure_model_group_props().trainingJobs
+
+     def set_training_jobs(
+         self, training_jobs: Sequence[Union[str, DataProcessInstanceUrn]]
+     ) -> None:
+         self._ensure_model_group_props().trainingJobs = [
+             str(job) for job in training_jobs
+         ]
+
+     def add_training_job(
+         self, training_job: Union[str, DataProcessInstanceUrn]
+     ) -> None:
+         props = self._ensure_model_group_props()
+         if props.trainingJobs is None:
+             props.trainingJobs = []
+         props.trainingJobs.append(str(training_job))
+
+     def remove_training_job(
+         self, training_job: Union[str, DataProcessInstanceUrn]
+     ) -> None:
+         props = self._ensure_model_group_props()
+         if props.trainingJobs is not None:
+             props.trainingJobs = [
+                 job for job in props.trainingJobs if job != str(training_job)
+             ]
+
+     @property
+     def downstream_jobs(self) -> Optional[List[str]]:
+         return self._ensure_model_group_props().downstreamJobs
+
+     def set_downstream_jobs(
+         self, downstream_jobs: Sequence[Union[str, DataProcessInstanceUrn]]
+     ) -> None:
+         self._ensure_model_group_props().downstreamJobs = [
+             str(job) for job in downstream_jobs
+         ]
+
+     def add_downstream_job(
+         self, downstream_job: Union[str, DataProcessInstanceUrn]
+     ) -> None:
+         props = self._ensure_model_group_props()
+         if props.downstreamJobs is None:
+             props.downstreamJobs = []
+         props.downstreamJobs.append(str(downstream_job))
+
+     def remove_downstream_job(
+         self, downstream_job: Union[str, DataProcessInstanceUrn]
+     ) -> None:
+         props = self._ensure_model_group_props()
+         if props.downstreamJobs is not None:
+             props.downstreamJobs = [
+                 job for job in props.downstreamJobs if job != str(downstream_job)
+             ]
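
For orientation, the new SDK entity can be exercised roughly as follows. This is a minimal sketch based only on the class above; the platform name, URN, and property values are illustrative placeholders:

    from datahub.sdk.mlmodelgroup import MLModelGroup

    group = MLModelGroup(
        id="recommender-models",   # illustrative id
        platform="mlflow",         # illustrative platform
        description="Models powering the recommendations service",
        custom_properties={"team": "ml-platform"},
    )
    # Link a (hypothetical) training run and set a display URL.
    group.add_training_job("urn:li:dataProcessInstance:training-run-1")
    group.set_external_url("https://mlflow.example.com/groups/recommender")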
datahub/secret/datahub_secret_store.py
@@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Union
 
  from pydantic import BaseModel, validator
 
- from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+ from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.ingestion.graph.config import DatahubClientConfig
  from datahub.secret.datahub_secrets_client import DataHubSecretsClient
  from datahub.secret.secret_store import SecretStore
 
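
The practical effect of this change is only the import path: DatahubClientConfig now lives in datahub.ingestion.graph.config. A hedged sketch of the new canonical import (the server URL and token are placeholders):

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    # Connect using the relocated config class.
    config = DatahubClientConfig(server="http://localhost:8080", token=None)
    graph = DataHubGraph(config)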
datahub/specific/dataset.py
@@ -292,3 +292,15 @@ class DatasetPatchBuilder(
              value=timestamp,
          )
          return self
+
+     def set_external_url(
+         self, external_url: Optional[str] = None
+     ) -> "DatasetPatchBuilder":
+         if external_url is not None:
+             self._add_patch(
+                 DatasetProperties.ASPECT_NAME,
+                 "add",
+                 path=("externalUrl",),
+                 value=external_url,
+             )
+         return self
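
A short usage sketch for the new patch method. The dataset URN is a placeholder, and emitting the built proposals is assumed to follow the usual patch-builder flow:

    from datahub.specific.dataset import DatasetPatchBuilder

    patch_builder = DatasetPatchBuilder(
        "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"
    ).set_external_url("https://warehouse.example.com/db/table")

    for mcp in patch_builder.build():
        print(mcp)  # or emit via a REST emitter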
datahub/sql_parsing/fingerprint_utils.py (new file)
@@ -0,0 +1,6 @@
+ import hashlib
+
+
+ def generate_hash(text: str) -> str:
+     # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
+     return hashlib.sha256(text.encode("utf-8")).hexdigest()
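
The move does not change behavior: the helper remains a plain SHA-256 hex digest of the UTF-8 encoded text, so the following holds:

    import hashlib

    from datahub.sql_parsing.fingerprint_utils import generate_hash

    # Equivalent to hashing the encoded text directly.
    assert generate_hash("SELECT 1") == hashlib.sha256(b"SELECT 1").hexdigest()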
datahub/sql_parsing/sql_parsing_aggregator.py
@@ -32,6 +32,7 @@ from datahub.metadata.urns import (
      SchemaFieldUrn,
      Urn,
  )
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
  from datahub.sql_parsing.schema_resolver import (
      SchemaResolver,
      SchemaResolverInterface,
@@ -49,7 +50,6 @@ from datahub.sql_parsing.sqlglot_lineage import (
  )
  from datahub.sql_parsing.sqlglot_utils import (
      _parse_statement,
-     generate_hash,
      get_query_fingerprint,
      try_format_query,
  )
@@ -155,6 +155,47 @@ class QueryMetadata:
              actor=(self.actor or _DEFAULT_USER_URN).urn(),
          )
 
+     def get_subjects(
+         self,
+         downstream_urn: Optional[str],
+         include_fields: bool,
+     ) -> List[UrnStr]:
+         query_subject_urns = OrderedSet[UrnStr]()
+         for upstream in self.upstreams:
+             query_subject_urns.add(upstream)
+             if include_fields:
+                 for column in sorted(self.column_usage.get(upstream, [])):
+                     query_subject_urns.add(
+                         builder.make_schema_field_urn(upstream, column)
+                     )
+         if downstream_urn:
+             query_subject_urns.add(downstream_urn)
+             if include_fields:
+                 for column_lineage in self.column_lineage:
+                     query_subject_urns.add(
+                         builder.make_schema_field_urn(
+                             downstream_urn, column_lineage.downstream.column
+                         )
+                     )
+         return list(query_subject_urns)
+
+     def make_query_properties(self) -> models.QueryPropertiesClass:
+         return models.QueryPropertiesClass(
+             statement=models.QueryStatementClass(
+                 value=self.formatted_query_string,
+                 language=models.QueryLanguageClass.SQL,
+             ),
+             source=models.QuerySourceClass.SYSTEM,
+             created=self.make_created_audit_stamp(),
+             lastModified=self.make_last_modified_audit_stamp(),
+         )
+
+
+ def make_query_subjects(urns: List[UrnStr]) -> models.QuerySubjectsClass:
+     return models.QuerySubjectsClass(
+         subjects=[models.QuerySubjectClass(entity=urn) for urn in urns]
+     )
+
 
  @dataclasses.dataclass
  class KnownQueryLineageInfo:
@@ -1440,42 +1481,15 @@ class SqlParsingAggregator(Closeable):
              self.report.num_queries_skipped_due_to_filters += 1
              return
 
-         query_subject_urns = OrderedSet[UrnStr]()
-         for upstream in query.upstreams:
-             query_subject_urns.add(upstream)
-             if self.generate_query_subject_fields:
-                 for column in sorted(query.column_usage.get(upstream, [])):
-                     query_subject_urns.add(
-                         builder.make_schema_field_urn(upstream, column)
-                     )
-         if downstream_urn:
-             query_subject_urns.add(downstream_urn)
-             if self.generate_query_subject_fields:
-                 for column_lineage in query.column_lineage:
-                     query_subject_urns.add(
-                         builder.make_schema_field_urn(
-                             downstream_urn, column_lineage.downstream.column
-                         )
-                     )
-
          yield from MetadataChangeProposalWrapper.construct_many(
              entityUrn=self._query_urn(query_id),
              aspects=[
-                 models.QueryPropertiesClass(
-                     statement=models.QueryStatementClass(
-                         value=query.formatted_query_string,
-                         language=models.QueryLanguageClass.SQL,
-                     ),
-                     source=models.QuerySourceClass.SYSTEM,
-                     created=query.make_created_audit_stamp(),
-                     lastModified=query.make_last_modified_audit_stamp(),
-                     origin=query.origin.urn() if query.origin else None,
-                 ),
-                 models.QuerySubjectsClass(
-                     subjects=[
-                         models.QuerySubjectClass(entity=urn)
-                         for urn in query_subject_urns
-                     ]
+                 query.make_query_properties(),
+                 make_query_subjects(
+                     query.get_subjects(
+                         downstream_urn=downstream_urn,
+                         include_fields=self.generate_query_subject_fields,
+                     )
                  ),
                  models.DataPlatformInstanceClass(
                      platform=self.platform.urn(),
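
The inline subject-collection and QueryProperties construction now live on QueryMetadata, making them reusable outside the aggregator; one visible difference is that the extracted make_query_properties no longer sets the origin field the inline version passed. A rough sketch of how the pieces compose, assuming `query` is a QueryMetadata instance built by the aggregator:

    # include_fields mirrors self.generate_query_subject_fields above.
    properties_aspect = query.make_query_properties()
    subjects_aspect = make_query_subjects(
        query.get_subjects(downstream_urn=None, include_fields=True)
    )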
datahub/sql_parsing/sqlglot_utils.py
@@ -1,7 +1,6 @@
  from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
 
  import functools
- import hashlib
  import logging
  import re
  from typing import Dict, Iterable, Optional, Tuple, Union
@@ -10,6 +9,8 @@ import sqlglot
  import sqlglot.errors
  import sqlglot.optimizer.eliminate_ctes
 
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
+
  assert SQLGLOT_PATCHED
 
  logger = logging.getLogger(__name__)
@@ -251,13 +252,11 @@ def generalize_query(expression: sqlglot.exp.ExpOrStr, dialect: DialectOrStr) ->
      return expression.transform(_strip_expression, copy=True).sql(dialect=dialect)
 
 
- def generate_hash(text: str) -> str:
-     # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
-     return hashlib.sha256(text.encode("utf-8")).hexdigest()
-
-
  def get_query_fingerprint_debug(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> Tuple[str, Optional[str]]:
      try:
          if not fast:
@@ -272,16 +271,18 @@ def get_query_fingerprint_debug(
          logger.debug("Failed to generalize query for fingerprinting: %s", e)
          expression_sql = None
 
-     fingerprint = generate_hash(
-         expression_sql
-         if expression_sql is not None
-         else _expression_to_string(expression, platform=platform)
-     )
+     text = expression_sql or _expression_to_string(expression, platform=platform)
+     if secondary_id:
+         text = text + " -- " + secondary_id
+     fingerprint = generate_hash(text=text)
 
      return fingerprint, expression_sql
 
 
  def get_query_fingerprint(
-     expression: sqlglot.exp.ExpOrStr, platform: DialectOrStr, fast: bool = False
+     expression: sqlglot.exp.ExpOrStr,
+     platform: DialectOrStr,
+     fast: bool = False,
+     secondary_id: Optional[str] = None,
  ) -> str:
      """Get a fingerprint for a SQL query.
 
@@ -298,12 +299,15 @@ def get_query_fingerprint(
      Args:
          expression: The SQL query to fingerprint.
          platform: The SQL dialect to use.
+         secondary_id: An optional additional id string to include in the final fingerprint.
 
      Returns:
          The fingerprint for the SQL query.
      """
 
-     return get_query_fingerprint_debug(expression, platform, fast=fast)[0]
+     return get_query_fingerprint_debug(
+         expression=expression, platform=platform, fast=fast, secondary_id=secondary_id
+     )[0]
 
 
  @functools.lru_cache(maxsize=FORMAT_QUERY_CACHE_SIZE)
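
A sketch of what the new parameter enables: the secondary id is appended to the canonicalized SQL text as a trailing `-- <id>` comment before hashing, so the same statement can yield distinct fingerprints per scope. The dialect and id values below are illustrative:

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    sql = "SELECT a FROM t"
    base = get_query_fingerprint(sql, platform="snowflake")
    scoped = get_query_fingerprint(sql, platform="snowflake", secondary_id="session-42")
    # Mixing in the secondary id changes the hashed text, hence the fingerprint.
    assert base != scoped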
datahub/telemetry/telemetry.py
@@ -352,10 +352,10 @@ class Telemetry:
              }
          else:
              return {
-                 "server_type": server.server_config.get("datahub", {}).get(
+                 "server_type": server.server_config.raw_config.get("datahub", {}).get(
                      "serverType", "missing"
                  ),
-                 "server_version": server.server_config.get("versions", {})
+                 "server_version": server.server_config.raw_config.get("versions", {})
                  .get("acryldata/datahub", {})
                  .get("version", "missing"),
                  "server_id": server.server_id or "missing",
datahub/testing/check_imports.py
@@ -9,7 +9,7 @@ def ensure_no_indirect_model_imports(dirs: List[pathlib.Path]) -> None:
      # If our needs become more complex, we should move to a proper linter.
      denied_imports = {
          "src.": "datahub.*",
-         "datahub.metadata._schema_classes": "datahub.metadata.schema_classes",
+         "datahub.metadata._internal_schema_classes": "datahub.metadata.schema_classes",
          "datahub.metadata._urns": "datahub.metadata.urns",
      }
      ignored_files = {
datahub/testing/mcp_diff.py
@@ -2,7 +2,7 @@ import dataclasses
  import json
  import re
  from collections import defaultdict
- from typing import Any, Dict, List, Sequence, Set, Tuple, Union
+ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 
  import deepdiff.serialization
  import yaml
@@ -34,6 +34,7 @@ class AspectForDiff:
      aspect_name: str
      aspect: Dict[str, Any] = dataclasses.field(hash=False)
      delta_info: "DeltaInfo" = dataclasses.field(hash=False, repr=False)
+     headers: Optional[Dict[str, str]] = dataclasses.field(default=None, hash=False)
 
      @classmethod
      def create_from_mcp(cls, idx: int, obj: Dict[str, Any]) -> "AspectForDiff":
@@ -44,6 +45,7 @@ class AspectForDiff:
              aspect_name=obj["aspectName"],
              aspect=aspect.get("json", aspect),
              delta_info=DeltaInfo(idx=idx, original=obj),
+             headers=obj.get("headers"),
          )
 
      def __repr__(self):
@@ -240,9 +242,12 @@ class MCPDiff:
              s.append(serialize_aspect(ga.aspect))
          for (i, old, new), diffs in aspect_diffs.aspects_changed.items():
              s.append(self.report_aspect(old, i, "changed") + ":")
+
+             print_aspects = False
              for diff_level in diffs:
                  s.append(self.report_diff_level(diff_level, i))
-             if verbose:
+                 print_aspects |= self.is_diff_level_on_aspect(diff_level)
+             if verbose and print_aspects:
                  s.append(f"Old aspect:\n{serialize_aspect(old.aspect)}")
                  s.append(f"New aspect:\n{serialize_aspect(new.aspect)}")
 
@@ -271,6 +276,14 @@ class MCPDiff:
              f"root[{idx}].", ""
          )
 
+     @staticmethod
+     def is_diff_level_on_aspect(diff: DiffLevel) -> bool:
+         skip_print_fields = ["changeType", "headers"]
+         try:
+             return diff.path(output_format="list")[1] not in skip_print_fields
+         except IndexError:
+             return True
+
 
  def serialize_aspect(aspect: Union[AspectForDiff, Dict[str, Any]]) -> str:
      if isinstance(aspect, AspectForDiff):  # Unpack aspect
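
For context on is_diff_level_on_aspect: deepdiff tree-view paths over a list of MCP dicts take the form `root[<idx>].<field>...`, so index 1 of the list form is the top-level MCP field being compared, and diffs confined to changeType or headers no longer force full aspect printing. A toy illustration (deepdiff is already a dependency of this module):

    from deepdiff import DeepDiff

    old = [{"changeType": "UPSERT", "aspect": {"value": 1}}]
    new = [{"changeType": "PATCH", "aspect": {"value": 1}}]
    for diff_level in DeepDiff(old, new, view="tree")["values_changed"]:
        # Prints [0, 'changeType']; index 1 is the top-level MCP field.
        print(diff_level.path(output_format="list"))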
datahub/upgrade/upgrade.py
@@ -13,7 +13,9 @@ from pydantic import BaseModel
  from datahub._version import __version__
  from datahub.cli.config_utils import load_client_config
  from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.ingestion.graph.config import ClientMode
  from datahub.utilities.perf_timer import PerfTimer
+ from datahub.utilities.server_config_util import RestServiceConfig
 
  log = logging.getLogger(__name__)
 
@@ -109,7 +111,7 @@ async def get_github_stats():
      return (latest_server_version, latest_server_date)
 
 
- async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
+ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
      import aiohttp
 
      headers = {
@@ -124,7 +126,7 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
      config_endpoint = f"{gms_url}/config"
      async with session.get(config_endpoint, headers=headers) as dh_response:
          dh_response_json = await dh_response.json()
-         return dh_response_json
+         return RestServiceConfig(raw_config=dh_response_json)
 
 
  async def get_server_version_stats(
@@ -132,11 +134,12 @@ async def get_server_version_stats(
  ) -> Tuple[Optional[str], Optional[Version], Optional[datetime]]:
      import aiohttp
 
-     server_config = None
+     server_config: Optional[RestServiceConfig] = None
      if not server:
          try:
              # let's get the server from the cli config
              client_config = load_client_config()
+             client_config.client_mode = ClientMode.CLI
              host = client_config.server
              token = client_config.token
              server_config = await get_server_config(host, token)
@@ -150,15 +153,10 @@ async def get_server_version_stats(
      server_version: Optional[Version] = None
      current_server_release_date = None
      if server_config:
-         server_version_string = (
-             server_config.get("versions", {})
-             .get("acryldata/datahub", {})
-             .get("version")
-         )
-         commit_hash = (
-             server_config.get("versions", {}).get("acryldata/datahub", {}).get("commit")
-         )
-         server_type = server_config.get("datahub", {}).get("serverType", "unknown")
+         server_version_string = server_config.service_version
+         commit_hash = server_config.commit_hash
+         server_type = server_config.server_type
+
          if server_type == "quickstart" and commit_hash:
              async with aiohttp.ClientSession(
                  headers={"Accept": "application/vnd.github.v3+json"}
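
A hedged sketch of the RestServiceConfig wrapper these call sites now rely on, using only the attributes visible in this diff (raw_config, service_version, commit_hash, server_type); the raw payload shape mirrors the /config response fields read by the old dict lookups:

    from datahub.utilities.server_config_util import RestServiceConfig

    config = RestServiceConfig(
        raw_config={
            "datahub": {"serverType": "quickstart"},
            "versions": {"acryldata/datahub": {"version": "v1.0.0", "commit": "abc123"}},
        }
    )
    print(config.server_type)      # "quickstart"
    print(config.service_version)  # "v1.0.0"
    print(config.commit_hash)      # "abc123"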
datahub/utilities/logging_manager.py
@@ -161,6 +161,7 @@ class _LogBuffer:
          self._buffer: Deque[str] = collections.deque(maxlen=maxlen)
 
      def write(self, line: str) -> None:
+         # We do not expect `line` to have a trailing newline.
          if len(line) > IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH:
              line = line[:IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH] + "[truncated]"
 
@@ -188,7 +189,13 @@ class _BufferLogHandler(logging.Handler):
              message = self.format(record)
          except TypeError as e:
              message = f"Error formatting log message: {e}\nMessage: {record.msg}, Args: {record.args}"
-         self._storage.write(message)
+
+         # For exception stack traces, the message is split over multiple lines,
+         # but we store it as a single string. Because we truncate based on line
+         # length, it's better for us to split it into multiple lines so that we
+         # don't lose any information on deeper stack traces.
+         for line in message.split("\n"):
+             self._storage.write(line)
 
 
  def _remove_all_handlers(logger: logging.Logger) -> None: