acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3431 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +47 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/source.py +19 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +19 -9
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/report.py
CHANGED
|
@@ -6,19 +6,25 @@ from collections import defaultdict
|
|
|
6
6
|
from dataclasses import dataclass, field
|
|
7
7
|
from datetime import datetime, timedelta
|
|
8
8
|
from enum import Enum
|
|
9
|
-
from typing import Any, Dict, Optional, Set, cast, runtime_checkable
|
|
9
|
+
from typing import Any, Dict, List, Optional, Set, Union, cast, runtime_checkable
|
|
10
10
|
|
|
11
11
|
import humanfriendly
|
|
12
12
|
import pydantic
|
|
13
13
|
from pydantic import BaseModel
|
|
14
14
|
from typing_extensions import Literal, Protocol
|
|
15
15
|
|
|
16
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
16
17
|
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
17
18
|
from datahub.ingestion.api.closeable import Closeable
|
|
18
19
|
from datahub.ingestion.api.report_helpers import format_datetime_relative
|
|
19
20
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
21
|
+
from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
|
|
20
22
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
21
|
-
from datahub.metadata.schema_classes import SubTypesClass, UpstreamLineageClass
|
|
23
|
+
from datahub.metadata.schema_classes import (
|
|
24
|
+
MetadataChangeProposalClass,
|
|
25
|
+
SubTypesClass,
|
|
26
|
+
UpstreamLineageClass,
|
|
27
|
+
)
|
|
22
28
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
23
29
|
from datahub.utilities.lossy_collections import LossyList
|
|
24
30
|
|
|
@@ -125,8 +131,6 @@ class ReportAttribute(BaseModel):
|
|
|
125
131
|
|
|
126
132
|
@dataclass
|
|
127
133
|
class ExamplesReport(Report, Closeable):
|
|
128
|
-
_urns_seen: Set[str] = field(default_factory=set)
|
|
129
|
-
entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
|
|
130
134
|
aspects: Dict[str, Dict[str, int]] = field(
|
|
131
135
|
default_factory=lambda: defaultdict(lambda: defaultdict(int))
|
|
132
136
|
)
|
|
@@ -135,11 +139,16 @@ class ExamplesReport(Report, Closeable):
|
|
|
135
139
|
lambda: defaultdict(lambda: defaultdict(int))
|
|
136
140
|
)
|
|
137
141
|
)
|
|
138
|
-
|
|
139
|
-
default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
|
|
142
|
+
samples: Dict[str, Dict[str, List[str]]] = field(
|
|
143
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(list))
|
|
140
144
|
)
|
|
141
145
|
_file_based_dict: Optional[FileBackedDict[SourceReportSubtypes]] = None
|
|
142
146
|
|
|
147
|
+
# We are adding this to make querying easier for fine-grained lineage
|
|
148
|
+
_fine_grained_lineage_special_case_name = "fineGrainedLineages"
|
|
149
|
+
_samples_to_add: int = 20
|
|
150
|
+
_lineage_aspects_seen: Set[str] = field(default_factory=set)
|
|
151
|
+
|
|
143
152
|
def __post_init__(self) -> None:
|
|
144
153
|
self._file_based_dict = FileBackedDict(
|
|
145
154
|
tablename="urn_aspects",
|
|
@@ -157,6 +166,151 @@ class ExamplesReport(Report, Closeable):
|
|
|
157
166
|
self._file_based_dict.close()
|
|
158
167
|
self._file_based_dict = None
|
|
159
168
|
|
|
169
|
+
def _build_aspects_where_clause(self, aspects: List[str]) -> str:
|
|
170
|
+
"""Build WHERE clause for matching any of the given aspects."""
|
|
171
|
+
if not aspects:
|
|
172
|
+
return ""
|
|
173
|
+
|
|
174
|
+
conditions = []
|
|
175
|
+
for aspect in aspects:
|
|
176
|
+
conditions.append(f"aspects LIKE '%{aspect}%'")
|
|
177
|
+
|
|
178
|
+
return " OR ".join(conditions)
|
|
179
|
+
|
|
180
|
+
def _collect_samples_by_subtype(self, where_clause: str, sample_key: str) -> None:
|
|
181
|
+
"""Helper method to collect samples organized by subtype for a given where clause."""
|
|
182
|
+
|
|
183
|
+
subtype_query = f"""
|
|
184
|
+
SELECT DISTINCT subTypes
|
|
185
|
+
FROM urn_aspects
|
|
186
|
+
WHERE {where_clause}
|
|
187
|
+
"""
|
|
188
|
+
assert self._file_based_dict is not None
|
|
189
|
+
subtypes = set()
|
|
190
|
+
for row in self._file_based_dict.sql_query(subtype_query):
|
|
191
|
+
sub_type = row["subTypes"] or "unknown"
|
|
192
|
+
subtypes.add(sub_type)
|
|
193
|
+
|
|
194
|
+
for sub_type in subtypes:
|
|
195
|
+
query = f"""
|
|
196
|
+
SELECT urn
|
|
197
|
+
FROM urn_aspects
|
|
198
|
+
WHERE {where_clause} AND subTypes = ?
|
|
199
|
+
limit {self._samples_to_add}
|
|
200
|
+
"""
|
|
201
|
+
|
|
202
|
+
for row in self._file_based_dict.sql_query(query, (sub_type,)):
|
|
203
|
+
self.samples[sample_key][sub_type].append(row["urn"])
|
|
204
|
+
|
|
205
|
+
def _collect_samples_by_aspects(self, aspects: List[str], sample_key: str) -> None:
|
|
206
|
+
"""Helper method to collect samples for entities that have any of the given aspects."""
|
|
207
|
+
if not aspects:
|
|
208
|
+
return
|
|
209
|
+
|
|
210
|
+
where_clause = self._build_aspects_where_clause(aspects)
|
|
211
|
+
self._collect_samples_by_subtype(where_clause, sample_key)
|
|
212
|
+
|
|
213
|
+
def _collect_samples_by_lineage_aspects(
|
|
214
|
+
self, aspects: List[str], sample_key: str
|
|
215
|
+
) -> None:
|
|
216
|
+
"""Helper method to collect samples for entities that have any of the given lineage aspects.
|
|
217
|
+
|
|
218
|
+
Lineage aspects are stored in JSON format and require quote escaping in LIKE clauses.
|
|
219
|
+
"""
|
|
220
|
+
if not aspects:
|
|
221
|
+
return
|
|
222
|
+
|
|
223
|
+
lineage_conditions = []
|
|
224
|
+
for aspect in aspects:
|
|
225
|
+
lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
|
|
226
|
+
|
|
227
|
+
where_clause = " OR ".join(lineage_conditions)
|
|
228
|
+
self._collect_samples_by_subtype(where_clause, sample_key)
|
|
229
|
+
|
|
230
|
+
def _collect_samples_with_all_conditions(self, sample_key: str) -> None:
|
|
231
|
+
"""
|
|
232
|
+
Collect samples for entities that have lineage, profiling, and usage aspects.
|
|
233
|
+
These specific 3 cases are added here as these URNs will be shown in the UI. Subject to change in future.
|
|
234
|
+
"""
|
|
235
|
+
if not self._lineage_aspects_seen:
|
|
236
|
+
return
|
|
237
|
+
assert self._file_based_dict is not None
|
|
238
|
+
|
|
239
|
+
# Build lineage conditions using the same logic as _collect_samples_by_lineage_aspects
|
|
240
|
+
lineage_conditions = []
|
|
241
|
+
for aspect in self._lineage_aspects_seen:
|
|
242
|
+
lineage_conditions.append(f"aspects LIKE '%\"{aspect}\"%'")
|
|
243
|
+
lineage_where_clause = " OR ".join(lineage_conditions)
|
|
244
|
+
|
|
245
|
+
# Build profiling conditions using the same logic as _collect_samples_by_aspects
|
|
246
|
+
profiling_where_clause = self._build_aspects_where_clause(["datasetProfile"])
|
|
247
|
+
|
|
248
|
+
# Build usage conditions using the same logic as _collect_samples_by_aspects
|
|
249
|
+
usage_where_clause = self._build_aspects_where_clause(
|
|
250
|
+
[
|
|
251
|
+
"datasetUsageStatistics",
|
|
252
|
+
"chartUsageStatistics",
|
|
253
|
+
"dashboardUsageStatistics",
|
|
254
|
+
]
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
query = f"""
|
|
258
|
+
SELECT urn, subTypes
|
|
259
|
+
FROM urn_aspects
|
|
260
|
+
WHERE ({lineage_where_clause})
|
|
261
|
+
AND ({profiling_where_clause})
|
|
262
|
+
AND ({usage_where_clause})
|
|
263
|
+
limit {self._samples_to_add}
|
|
264
|
+
"""
|
|
265
|
+
|
|
266
|
+
for row in self._file_based_dict.sql_query(query):
|
|
267
|
+
sub_type = row["subTypes"] or "unknown"
|
|
268
|
+
self.samples[sample_key][sub_type].append(row["urn"])
|
|
269
|
+
|
|
270
|
+
def _has_fine_grained_lineage(
|
|
271
|
+
self, mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper]
|
|
272
|
+
) -> bool:
|
|
273
|
+
if isinstance(mcp.aspect, UpstreamLineageClass):
|
|
274
|
+
upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
|
|
275
|
+
if upstream_lineage.fineGrainedLineages:
|
|
276
|
+
return True
|
|
277
|
+
return False
|
|
278
|
+
|
|
279
|
+
def _update_file_based_dict(
|
|
280
|
+
self,
|
|
281
|
+
urn: str,
|
|
282
|
+
entityType: str,
|
|
283
|
+
aspectName: str,
|
|
284
|
+
mcp: Union[MetadataChangeProposalClass, MetadataChangeProposalWrapper],
|
|
285
|
+
) -> None:
|
|
286
|
+
if is_lineage_aspect(entityType, aspectName):
|
|
287
|
+
self._lineage_aspects_seen.add(aspectName)
|
|
288
|
+
has_fine_grained_lineage = self._has_fine_grained_lineage(mcp)
|
|
289
|
+
|
|
290
|
+
sub_type = "unknown"
|
|
291
|
+
if isinstance(mcp.aspect, SubTypesClass):
|
|
292
|
+
sub_type = mcp.aspect.typeNames[0]
|
|
293
|
+
|
|
294
|
+
assert self._file_based_dict is not None
|
|
295
|
+
if urn in self._file_based_dict:
|
|
296
|
+
if sub_type != "unknown":
|
|
297
|
+
self._file_based_dict[urn].subType = sub_type
|
|
298
|
+
self._file_based_dict[urn].aspects.add(aspectName)
|
|
299
|
+
if has_fine_grained_lineage:
|
|
300
|
+
self._file_based_dict[urn].aspects.add(
|
|
301
|
+
self._fine_grained_lineage_special_case_name
|
|
302
|
+
)
|
|
303
|
+
self._file_based_dict.mark_dirty(urn)
|
|
304
|
+
else:
|
|
305
|
+
self._file_based_dict[urn] = SourceReportSubtypes(
|
|
306
|
+
urn=urn,
|
|
307
|
+
entity_type=entityType,
|
|
308
|
+
subType=sub_type,
|
|
309
|
+
aspects={aspectName}
|
|
310
|
+
if not has_fine_grained_lineage
|
|
311
|
+
else {aspectName, self._fine_grained_lineage_special_case_name},
|
|
312
|
+
)
|
|
313
|
+
|
|
160
314
|
def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
|
|
161
315
|
urn = wu.get_urn()
|
|
162
316
|
|
|
@@ -169,41 +323,15 @@ class ExamplesReport(Report, Closeable):
|
|
|
169
323
|
entityType = mcp.entityType
|
|
170
324
|
aspectName = mcp.aspectName
|
|
171
325
|
|
|
172
|
-
if urn not in self._urns_seen:
|
|
173
|
-
self._urns_seen.add(urn)
|
|
174
|
-
self.entities[entityType].append(urn)
|
|
175
|
-
|
|
176
326
|
if aspectName is None:
|
|
177
327
|
continue
|
|
178
|
-
|
|
179
|
-
self.aspects[entityType][aspectName] += 1
|
|
180
|
-
sub_type = "unknown"
|
|
181
|
-
if isinstance(mcp.aspect, UpstreamLineageClass):
|
|
182
|
-
upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
|
|
183
|
-
if upstream_lineage.fineGrainedLineages:
|
|
184
|
-
self.aspect_urn_samples[entityType]["fineGrainedLineages"].append(
|
|
185
|
-
urn
|
|
186
|
-
)
|
|
187
|
-
self.aspects[entityType]["fineGrainedLineages"] += 1
|
|
188
|
-
elif isinstance(mcp.aspect, SubTypesClass):
|
|
189
|
-
sub_type = mcp.aspect.typeNames[0]
|
|
190
|
-
assert self._file_based_dict is not None
|
|
191
|
-
if urn in self._file_based_dict:
|
|
192
|
-
if sub_type != "unknown":
|
|
193
|
-
self._file_based_dict[urn].subType = sub_type
|
|
194
|
-
self._file_based_dict[urn].aspects.add(aspectName)
|
|
195
|
-
self._file_based_dict.mark_dirty(urn)
|
|
196
|
-
else:
|
|
197
|
-
self._file_based_dict[urn] = SourceReportSubtypes(
|
|
198
|
-
urn=urn,
|
|
199
|
-
entity_type=entityType,
|
|
200
|
-
subType=sub_type,
|
|
201
|
-
aspects={aspectName},
|
|
202
|
-
)
|
|
328
|
+
|
|
329
|
+
self._update_file_based_dict(urn, entityType, aspectName, mcp)
|
|
203
330
|
|
|
204
331
|
def compute_stats(self) -> None:
|
|
205
332
|
if self._file_based_dict is None:
|
|
206
333
|
return
|
|
334
|
+
|
|
207
335
|
query = """
|
|
208
336
|
SELECT entityType, subTypes, aspects, count(*) as count
|
|
209
337
|
FROM urn_aspects
|
|
@@ -223,11 +351,31 @@ class ExamplesReport(Report, Closeable):
|
|
|
223
351
|
for aspect in aspects:
|
|
224
352
|
entity_subtype_aspect_counts[entity_type][sub_type][aspect] += count
|
|
225
353
|
|
|
354
|
+
self.aspects.clear()
|
|
226
355
|
self.aspects_by_subtypes.clear()
|
|
356
|
+
_aspects_seen: Set[str] = set()
|
|
227
357
|
for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
|
|
228
358
|
for sub_type, aspect_counts in subtype_counts.items():
|
|
359
|
+
for aspect, count in aspect_counts.items():
|
|
360
|
+
self.aspects[entity_type][aspect] += count
|
|
361
|
+
_aspects_seen.add(aspect)
|
|
229
362
|
self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
|
|
230
363
|
|
|
364
|
+
self.samples.clear()
|
|
365
|
+
self._collect_samples_by_aspects(["datasetProfile"], "profiling")
|
|
366
|
+
self._collect_samples_by_aspects(
|
|
367
|
+
[
|
|
368
|
+
"datasetUsageStatistics",
|
|
369
|
+
"chartUsageStatistics",
|
|
370
|
+
"dashboardUsageStatistics",
|
|
371
|
+
],
|
|
372
|
+
"usage",
|
|
373
|
+
)
|
|
374
|
+
self._collect_samples_by_lineage_aspects(
|
|
375
|
+
list(self._lineage_aspects_seen), "lineage"
|
|
376
|
+
)
|
|
377
|
+
self._collect_samples_with_all_conditions("all_3")
|
|
378
|
+
|
|
231
379
|
|
|
232
380
|
class EntityFilterReport(ReportAttribute):
|
|
233
381
|
type: str
|