acryl-datahub 1.3.0.1rc8__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (74)
  1. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/METADATA +2654 -2647
  2. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/RECORD +74 -74
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +5 -1
  5. datahub/ingestion/run/pipeline.py +1 -0
  6. datahub/ingestion/source/dremio/dremio_api.py +212 -78
  7. datahub/ingestion/source/dremio/dremio_entities.py +55 -39
  8. datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
  9. datahub/ingestion/source/dremio/dremio_source.py +9 -11
  10. datahub/ingestion/source/elastic_search.py +106 -29
  11. datahub/ingestion/source/snowflake/snowflake_queries.py +27 -3
  12. datahub/ingestion/source/sql_queries.py +164 -15
  13. datahub/metadata/_internal_schema_classes.py +62 -2
  14. datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  15. datahub/metadata/schema.avsc +264 -89
  16. datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
  17. datahub/metadata/schemas/AssertionInfo.avsc +48 -5
  18. datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
  19. datahub/metadata/schemas/ChartInfo.avsc +12 -5
  20. datahub/metadata/schemas/ContainerProperties.avsc +12 -5
  21. datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  22. datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
  23. datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
  24. datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
  25. datahub/metadata/schemas/DashboardInfo.avsc +16 -4
  26. datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
  27. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
  28. datahub/metadata/schemas/DataJobInfo.avsc +9 -4
  29. datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
  30. datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  31. datahub/metadata/schemas/DataProductProperties.avsc +5 -2
  32. datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
  33. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  34. datahub/metadata/schemas/DatasetProperties.avsc +12 -5
  35. datahub/metadata/schemas/DomainProperties.avsc +7 -3
  36. datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
  37. datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  38. datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  39. datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  40. datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  41. datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  42. datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  43. datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  44. datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  45. datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  46. datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  47. datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
  48. datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
  49. datahub/metadata/schemas/GlobalTags.avsc +3 -2
  50. datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  51. datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  52. datahub/metadata/schemas/InputFields.avsc +3 -2
  53. datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
  54. datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
  55. datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  56. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  57. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  58. datahub/metadata/schemas/MLModelProperties.avsc +4 -2
  59. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
  60. datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
  61. datahub/metadata/schemas/NotebookInfo.avsc +5 -2
  62. datahub/metadata/schemas/Ownership.avsc +3 -2
  63. datahub/metadata/schemas/RoleProperties.avsc +3 -1
  64. datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  65. datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
  66. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +9 -3
  67. datahub/metadata/schemas/TagProperties.avsc +3 -1
  68. datahub/metadata/schemas/TestInfo.avsc +2 -1
  69. datahub/sql_parsing/schema_resolver.py +29 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
  71. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/WHEEL +0 -0
  72. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/entry_points.txt +0 -0
  73. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/licenses/LICENSE +0 -0
  74. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import contextlib
2
2
  import pathlib
3
+ from dataclasses import dataclass
3
4
  from typing import Dict, List, Optional, Protocol, Set, Tuple
4
5
 
5
6
  from typing_extensions import TypedDict
@@ -22,6 +23,14 @@ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_fie
22
23
  SchemaInfo = Dict[str, str]
23
24
 
24
25
 
26
+ @dataclass
27
+ class SchemaResolverReport:
28
+ """Report class for tracking SchemaResolver cache performance."""
29
+
30
+ num_schema_cache_hits: int = 0
31
+ num_schema_cache_misses: int = 0
32
+
33
+
25
34
  class GraphQLSchemaField(TypedDict):
26
35
  fieldPath: str
27
36
  nativeDataType: str
@@ -53,6 +62,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
53
62
  env: str = DEFAULT_ENV,
54
63
  graph: Optional[DataHubGraph] = None,
55
64
  _cache_filename: Optional[pathlib.Path] = None,
65
+ report: Optional[SchemaResolverReport] = None,
56
66
  ):
57
67
  # Also supports platform with an urn prefix.
58
68
  self._platform = DataPlatformUrn(platform).platform_name
@@ -60,6 +70,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
60
70
  self.env = env
61
71
 
62
72
  self.graph = graph
73
+ self.report = report
63
74
 
64
75
  # Init cache, potentially restoring from a previous run.
65
76
  shared_conn = None
@@ -132,12 +143,14 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
132
143
 
133
144
  schema_info = self._resolve_schema_info(urn)
134
145
  if schema_info:
146
+ self._track_cache_hit()
135
147
  return urn, schema_info
136
148
 
137
149
  urn_lower = self.get_urn_for_table(table, lower=True)
138
150
  if urn_lower != urn:
139
151
  schema_info = self._resolve_schema_info(urn_lower)
140
152
  if schema_info:
153
+ self._track_cache_hit()
141
154
  return urn_lower, schema_info
142
155
 
143
156
  # Our treatment of platform instances when lowercasing urns
@@ -152,8 +165,12 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
152
165
  if urn_mixed not in {urn, urn_lower}:
153
166
  schema_info = self._resolve_schema_info(urn_mixed)
154
167
  if schema_info:
168
+ self._track_cache_hit()
155
169
  return urn_mixed, schema_info
156
170
 
171
+ # Track cache miss for the final attempt
172
+ self._track_cache_miss()
173
+
157
174
  if self._prefers_urn_lower():
158
175
  return urn_lower, None
159
176
  else:
@@ -165,6 +182,16 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
165
182
  def has_urn(self, urn: str) -> bool:
166
183
  return self._schema_cache.get(urn) is not None
167
184
 
185
+ def _track_cache_hit(self) -> None:
186
+ """Track a cache hit if reporting is enabled."""
187
+ if self.report is not None:
188
+ self.report.num_schema_cache_hits += 1
189
+
190
+ def _track_cache_miss(self) -> None:
191
+ """Track a cache miss if reporting is enabled."""
192
+ if self.report is not None:
193
+ self.report.num_schema_cache_misses += 1
194
+
168
195
  def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]:
169
196
  if urn in self._schema_cache:
170
197
  return self._schema_cache[urn]
@@ -261,6 +288,8 @@ class _SchemaResolverWithExtras(SchemaResolverInterface):
261
288
  table, lower=self._base_resolver._prefers_urn_lower()
262
289
  )
263
290
  if urn in self._extra_schemas:
291
+ # Track cache hit for extra schemas
292
+ self._base_resolver._track_cache_hit()
264
293
  return urn, self._extra_schemas[urn]
265
294
  return self._base_resolver.resolve_table(table)
266
295
 
@@ -168,6 +168,12 @@ class QueryMetadata:
168
168
  query_subject_urns.add(upstream)
169
169
  if include_fields:
170
170
  for column in sorted(self.column_usage.get(upstream, [])):
171
+ # Skip empty column names to avoid creating invalid URNs
172
+ if not column or not column.strip():
173
+ logger.warning(
174
+ f"Skipping empty upstream column name for query {self.query_id} on upstream {upstream}"
175
+ )
176
+ continue
171
177
  query_subject_urns.add(
172
178
  builder.make_schema_field_urn(upstream, column)
173
179
  )
@@ -175,6 +181,15 @@ class QueryMetadata:
175
181
  query_subject_urns.add(downstream_urn)
176
182
  if include_fields:
177
183
  for column_lineage in self.column_lineage:
184
+ # Skip empty downstream columns to avoid creating invalid URNs
185
+ if (
186
+ not column_lineage.downstream.column
187
+ or not column_lineage.downstream.column.strip()
188
+ ):
189
+ logger.warning(
190
+ f"Skipping empty downstream column name for query {self.query_id} on downstream {downstream_urn}"
191
+ )
192
+ continue
178
193
  query_subject_urns.add(
179
194
  builder.make_schema_field_urn(
180
195
  downstream_urn, column_lineage.downstream.column