acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the package registry advisory for more details.

Files changed (78)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
@@ -6,19 +6,25 @@ from collections import defaultdict
6
6
  from dataclasses import dataclass, field
7
7
  from datetime import datetime, timedelta
8
8
  from enum import Enum
9
- from typing import Any, Dict, Optional, Set, cast, runtime_checkable
9
+ from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
10
10
 
11
11
  import humanfriendly
12
12
  import pydantic
13
13
  from pydantic import BaseModel
14
14
  from typing_extensions import Literal, Protocol
15
15
 
16
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
16
17
  from datahub.emitter.mcp_builder import mcps_from_mce
17
18
  from datahub.ingestion.api.closeable import Closeable
18
19
  from datahub.ingestion.api.report_helpers import format_datetime_relative
19
20
  from datahub.ingestion.api.workunit import MetadataWorkUnit
21
+ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
20
22
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
21
- from datahub.metadata.schema_classes import SubTypesClass, UpstreamLineageClass
23
+ from datahub.metadata.schema_classes import (
24
+ MetadataChangeProposalClass,
25
+ SubTypesClass,
26
+ UpstreamLineageClass,
27
+ )
22
28
  from datahub.utilities.file_backed_collections import FileBackedDict
23
29
  from datahub.utilities.lossy_collections import LossyList
24
30
 
@@ -125,8 +131,6 @@ class ReportAttribute(BaseModel):
125
131
 
126
132
  @dataclass
127
133
  class ExamplesReport(Report, Closeable):
128
- _urns_seen: Set[str] = field(default_factory=set)
129
- entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
130
134
  aspects: Dict[str, Dict[str, int]] = field(
131
135
  default_factory=lambda: defaultdict(lambda: defaultdict(int))
132
136
  )
@@ -135,11 +139,16 @@ class ExamplesReport(Report, Closeable):
135
139
  lambda: defaultdict(lambda: defaultdict(int))
136
140
  )
137
141
  )
138
- aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
139
- default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
142
+ samples: Dict[str, Dict[str, List[str]]] = field(
143
+ default_factory=lambda: defaultdict(lambda: defaultdict(list))
140
144
  )
141
145
  _file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
142
146
 
147
+ # We are adding this to make querying easier for fine-grained lineage
148
+ _fine_grained_lineage_special_case_name = "fineGrainedLineages"
149
+ _samples_to_add: int = 20
150
+ _lineage_aspects_seen: Set[str] = field(default_factory=set)
151
+
143
152
  def __post_init__(self) -> None:
144
153
  self._file_based_dict = FileBackedDict(
145
154
  tablename="urn_aspects",
@@ -157,6 +166,151 @@ class ExamplesReport(Report, Closeable):
157
166
  self._file_based_dict.close()
158
167
  self._file_based_dict = None
159
168
 
169
+ def _build_aspects_where_clause(self, aspects: List[str]) -> str:
170
+ """Build WHERE clause for matching any of the given aspects."""
171
+ if not aspects:
172
+ return ""
173
+
174
+ conditions = []
175
+ for aspect in aspects:
176
+ conditions.append(f"aspects LIKE '%{aspect}%'")
177
+
178
+ return " OR ".join(conditions)
179
+
180
+ def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
181
+ """Helper method to collect samples organized by subtype for a given where clause."""
182
+
183
+ subtype_query = f"""
184
+ SELECT DISTINCT subTypes
185
+ FROM urn_aspects
186
+ WHERE {where_clause}
187
+ """
188
+ assert self._file_based_dict is not None
189
+ subtypes = set()
190
+ for row in self._file_based_dict.sql_query(subtype_query):
191
+ sub_type = row["subTypes"] or "unknown"
192
+ subtypes.add(sub_type)
193
+
194
+ for sub_type in subtypes:
195
+ query = f"""
196
+ SELECT urn
197
+ FROM urn_aspects
198
+ WHERE {where_clause} AND subTypes = ?
199
+ limit {self._samples_to_add}
200
+ """
201
+
202
+ for row in self._file_based_dict.sql_query(query, (sub_type,)):
203
+ self.samples[sample_key][sub_type].append(row["urn"])
204
+
205
+ def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
206
+ """Helper method to collect samples for entities that have any of the given aspects."""
207
+ if not aspects:
208
+ return
209
+
210
+ where_clause = self._build_aspects_where_clause(aspects)
211
+ self._collect_samples_by_subtype(where_clause, sample_key)
212
+
213
+ def _collect_samples_by_lineage_aspects(
214
+ self, aspects: List[str], sample_key: str
215
+ ) -> None:
216
+ """Helper method to collect samples for entities that have any of the given lineage aspects.
217
+
218
+ Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
219
+ """
220
+ if not aspects:
221
+ return
222
+
223
+ lineage_conditions = []
224
+ for aspect in aspects:
225
+ lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
226
+
227
+ where_clause = " OR ".join(lineage_conditions)
228
+ self._collect_samples_by_subtype(where_clause, sample_key)
229
+
230
+ def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
231
+ """
232
+ Collect samples for entities that have lineage, profiling, and usage aspects.
233
+ These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
234
+ """
235
+ if not self._lineage_aspects_seen:
236
+ return
237
+ assert self._file_based_dict is not None
238
+
239
+ # Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
240
+ lineage_conditions = []
241
+ for aspect in self._lineage_aspects_seen:
242
+ lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
243
+ lineage_where_clause = " OR ".join(lineage_conditions)
244
+
245
+ # Build profiling conditions using the same logic as _collect_samples_by_aspects
246
+ profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
247
+
248
+ # Build usage conditions using the same logic as _collect_samples_by_aspects
249
+ usage_where_clause = self._build_aspects_where_clause(
250
+ [
251
+ "datasetUsageStatistics",
252
+ "chartUsageStatistics",
253
+ "dashboardUsageStatistics",
254
+ ]
255
+ )
256
+
257
+ query = f"""
258
+ SELECT urn, subTypes
259
+ FROM urn_aspects
260
+ WHERE ({lineage_where_clause})
261
+ AND ({profiling_where_clause})
262
+ AND ({usage_where_clause})
263
+ limit {self._samples_to_add}
264
+ """
265
+
266
+ for row in self._file_based_dict.sql_query(query):
267
+ sub_type = row["subTypes"] or "unknown"
268
+ self.samples[sample_key][sub_type].append(row["urn"])
269
+
270
+ def _has_fine_grained_lineage(
271
+ self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
272
+ ) -> bool:
273
+ if isinstance(mcp.aspect, UpstreamLineageClass):
274
+ upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
275
+ if upstream_lineage.fineGrainedLineages:
276
+ return True
277
+ return False
278
+
279
    def _update_file_based_dict(
        self,
        urn: str,
        entityType: str,
        aspectName: str,
        mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
    ) -> None:
        """Record one (urn, aspect) observation in the file-backed store.

        Tracks which lineage aspects were seen (for later sample queries),
        flags fine-grained lineage via the special-case aspect name, and
        updates or creates the per-URN SourceReportSubtypes entry.
        """
        if is_lineage_aspect(entityType, aspectName):
            # Remember the aspect name so compute_stats() can build lineage
            # sample queries later.
            self._lineage_aspects_seen.add(aspectName)
        has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)

        # Only SubTypes aspects carry a subtype; everything else stays "unknown".
        sub_type = "unknown"
        if isinstance(mcp.aspect, SubTypesClass):
            sub_type = mcp.aspect.typeNames[0]

        assert self._file_based_dict is not None
        if urn in self._file_based_dict:
            # Never downgrade a known subtype back to "unknown".
            if sub_type != "unknown":
                self._file_based_dict[urn].subType = sub_type
            self._file_based_dict[urn].aspects.add(aspectName)
            if has_fine_grained_lineage:
                # Stored as a pseudo-aspect to make fine-grained lineage
                # queryable alongside real aspect names.
                self._file_based_dict[urn].aspects.add(
                    self._fine_grained_lineage_special_case_name
                )
            # NOTE(review): in-place mutation of the stored value presumably
            # isn't auto-detected by FileBackedDict, so the entry must be
            # explicitly marked dirty to be persisted — confirm against
            # FileBackedDict's contract.
            self._file_based_dict.mark_dirty(urn)
        else:
            self._file_based_dict[urn] = SourceReportSubtypes(
                urn=urn,
                entity_type=entityType,
                subType=sub_type,
                aspects={aspectName}
                if not has_fine_grained_lineage
                else {aspectName, self._fine_grained_lineage_special_case_name},
            )
313
+
160
314
  def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
161
315
  urn = wu.get_urn()
162
316
 
@@ -169,41 +323,15 @@ class ExamplesReport(Report, Closeable):
169
323
  entityType = mcp.entityType
170
324
  aspectName = mcp.aspectName
171
325
 
172
- if urn not in self._urns_seen:
173
- self._urns_seen.add(urn)
174
- self.entities[entityType].append(urn)
175
-
176
326
  if aspectName is None:
177
327
  continue
178
- self.aspects[entityType][aspectName] += 1
179
- self.aspect_urn_samples[entityType][aspectName].append(urn)
180
- sub_type = "unknown"
181
- if isinstance(mcp.aspect, UpstreamLineageClass):
182
- upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
183
- if upstream_lineage.fineGrainedLineages:
184
- self.aspect_urn_samples[entityType]["fineGrainedLineages"].append(
185
- urn
186
- )
187
- self.aspects[entityType]["fineGrainedLineages"] += 1
188
- elif isinstance(mcp.aspect, SubTypesClass):
189
- sub_type = mcp.aspect.typeNames[0]
190
- assert self._file_based_dict is not None
191
- if urn in self._file_based_dict:
192
- if sub_type != "unknown":
193
- self._file_based_dict[urn].subType = sub_type
194
- self._file_based_dict[urn].aspects.add(aspectName)
195
- self._file_based_dict.mark_dirty(urn)
196
- else:
197
- self._file_based_dict[urn] = SourceReportSubtypes(
198
- urn=urn,
199
- entity_type=entityType,
200
- subType=sub_type,
201
- aspects={aspectName},
202
- )
328
+
329
+ self._update_file_based_dict(urn, entityType, aspectName, mcp)
203
330
 
204
331
  def compute_stats(self) -> None:
205
332
  if self._file_based_dict is None:
206
333
  return
334
+
207
335
  query = """
208
336
  SELECT entityType, subTypes, aspects, count(*) as count
209
337
  FROM urn_aspects
@@ -223,11 +351,31 @@ class ExamplesReport(Report, Closeable):
223
351
  for aspect in aspects:
224
352
  entity_subtype_aspect_counts[entity_type][sub_type][aspect] += count
225
353
 
354
+ self.aspects.clear()
226
355
  self.aspects_by_subtypes.clear()
356
+ _aspects_seen: Set[str] = set()
227
357
  for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
228
358
  for sub_type, aspect_counts in subtype_counts.items():
359
+ for aspect, count in aspect_counts.items():
360
+ self.aspects[entity_type][aspect] += count
361
+ _aspects_seen.add(aspect)
229
362
  self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
230
363
 
364
+ self.samples.clear()
365
+ self._collect_samples_by_aspects(["datasetProfile"], "profiling")
366
+ self._collect_samples_by_aspects(
367
+ [
368
+ "datasetUsageStatistics",
369
+ "chartUsageStatistics",
370
+ "dashboardUsageStatistics",
371
+ ],
372
+ "usage",
373
+ )
374
+ self._collect_samples_by_lineage_aspects(
375
+ list(self._lineage_aspects_seen), "lineage"
376
+ )
377
+ self._collect_samples_with_all_conditions("all_3")
378
+
231
379
 
232
380
  class EntityFilterReport(ReportAttribute):
233
381
  type: str