acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/METADATA +2468 -2461
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/RECORD +74 -74
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +9 -11
- datahub/ingestion/source/elastic_search.py +106 -29
- datahub/ingestion/source/snowflake/snowflake_queries.py +27 -3
- datahub/ingestion/source/sql_queries.py +164 -15
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +264 -89
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +9 -3
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import contextlib
|
|
2
2
|
import pathlib
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
from typing import Dict, List, Optional, Protocol, Set, Tuple
|
|
4
5
|
|
|
5
6
|
from typing_extensions import TypedDict
|
|
@@ -22,6 +23,14 @@ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_fie
|
|
|
22
23
|
SchemaInfo = Dict[str, str]
|
|
23
24
|
|
|
24
25
|
|
|
26
|
+
@dataclass
|
|
27
|
+
class SchemaResolverReport:
|
|
28
|
+
"""Report class for tracking SchemaResolver cache performance."""
|
|
29
|
+
|
|
30
|
+
num_schema_cache_hits: int = 0
|
|
31
|
+
num_schema_cache_misses: int = 0
|
|
32
|
+
|
|
33
|
+
|
|
25
34
|
class GraphQLSchemaField(TypedDict):
|
|
26
35
|
fieldPath: str
|
|
27
36
|
nativeDataType: str
|
|
@@ -53,6 +62,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
53
62
|
env: str = DEFAULT_ENV,
|
|
54
63
|
graph: Optional[DataHubGraph] = None,
|
|
55
64
|
_cache_filename: Optional[pathlib.Path] = None,
|
|
65
|
+
report: Optional[SchemaResolverReport] = None,
|
|
56
66
|
):
|
|
57
67
|
# Also supports platform with an urn prefix.
|
|
58
68
|
self._platform = DataPlatformUrn(platform).platform_name
|
|
@@ -60,6 +70,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
60
70
|
self.env = env
|
|
61
71
|
|
|
62
72
|
self.graph = graph
|
|
73
|
+
self.report = report
|
|
63
74
|
|
|
64
75
|
# Init cache, potentially restoring from a previous run.
|
|
65
76
|
shared_conn = None
|
|
@@ -132,12 +143,14 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
132
143
|
|
|
133
144
|
schema_info = self._resolve_schema_info(urn)
|
|
134
145
|
if schema_info:
|
|
146
|
+
self._track_cache_hit()
|
|
135
147
|
return urn, schema_info
|
|
136
148
|
|
|
137
149
|
urn_lower = self.get_urn_for_table(table, lower=True)
|
|
138
150
|
if urn_lower != urn:
|
|
139
151
|
schema_info = self._resolve_schema_info(urn_lower)
|
|
140
152
|
if schema_info:
|
|
153
|
+
self._track_cache_hit()
|
|
141
154
|
return urn_lower, schema_info
|
|
142
155
|
|
|
143
156
|
# Our treatment of platform instances when lowercasing urns
|
|
@@ -152,8 +165,12 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
152
165
|
if urn_mixed not in {urn, urn_lower}:
|
|
153
166
|
schema_info = self._resolve_schema_info(urn_mixed)
|
|
154
167
|
if schema_info:
|
|
168
|
+
self._track_cache_hit()
|
|
155
169
|
return urn_mixed, schema_info
|
|
156
170
|
|
|
171
|
+
# Track cache miss for the final attempt
|
|
172
|
+
self._track_cache_miss()
|
|
173
|
+
|
|
157
174
|
if self._prefers_urn_lower():
|
|
158
175
|
return urn_lower, None
|
|
159
176
|
else:
|
|
@@ -165,6 +182,16 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
165
182
|
def has_urn(self, urn: str) -> bool:
|
|
166
183
|
return self._schema_cache.get(urn) is not None
|
|
167
184
|
|
|
185
|
+
def _track_cache_hit(self) -> None:
|
|
186
|
+
"""Track a cache hit if reporting is enabled."""
|
|
187
|
+
if self.report is not None:
|
|
188
|
+
self.report.num_schema_cache_hits += 1
|
|
189
|
+
|
|
190
|
+
def _track_cache_miss(self) -> None:
|
|
191
|
+
"""Track a cache miss if reporting is enabled."""
|
|
192
|
+
if self.report is not None:
|
|
193
|
+
self.report.num_schema_cache_misses += 1
|
|
194
|
+
|
|
168
195
|
def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]:
|
|
169
196
|
if urn in self._schema_cache:
|
|
170
197
|
return self._schema_cache[urn]
|
|
@@ -261,6 +288,8 @@ class _SchemaResolverWithExtras(SchemaResolverInterface):
|
|
|
261
288
|
table, lower=self._base_resolver._prefers_urn_lower()
|
|
262
289
|
)
|
|
263
290
|
if urn in self._extra_schemas:
|
|
291
|
+
# Track cache hit for extra schemas
|
|
292
|
+
self._base_resolver._track_cache_hit()
|
|
264
293
|
return urn, self._extra_schemas[urn]
|
|
265
294
|
return self._base_resolver.resolve_table(table)
|
|
266
295
|
|
|
@@ -168,6 +168,12 @@ class QueryMetadata:
|
|
|
168
168
|
query_subject_urns.add(upstream)
|
|
169
169
|
if include_fields:
|
|
170
170
|
for column in sorted(self.column_usage.get(upstream, [])):
|
|
171
|
+
# Skip empty column names to avoid creating invalid URNs
|
|
172
|
+
if not column or not column.strip():
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Skipping empty upstream column name for query {self.query_id} on upstream {upstream}"
|
|
175
|
+
)
|
|
176
|
+
continue
|
|
171
177
|
query_subject_urns.add(
|
|
172
178
|
builder.make_schema_field_urn(upstream, column)
|
|
173
179
|
)
|
|
@@ -175,6 +181,15 @@ class QueryMetadata:
|
|
|
175
181
|
query_subject_urns.add(downstream_urn)
|
|
176
182
|
if include_fields:
|
|
177
183
|
for column_lineage in self.column_lineage:
|
|
184
|
+
# Skip empty downstream columns to avoid creating invalid URNs
|
|
185
|
+
if (
|
|
186
|
+
not column_lineage.downstream.column
|
|
187
|
+
or not column_lineage.downstream.column.strip()
|
|
188
|
+
):
|
|
189
|
+
logger.warning(
|
|
190
|
+
f"Skipping empty downstream column name for query {self.query_id} on downstream {downstream_urn}"
|
|
191
|
+
)
|
|
192
|
+
continue
|
|
178
193
|
query_subject_urns.add(
|
|
179
194
|
builder.make_schema_field_urn(
|
|
180
195
|
downstream_urn, column_lineage.downstream.column
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|