acryl-datahub 1.3.0.1rc1__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2539 -2537
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +47 -45
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/delta_lake/config.py +8 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/emitter/sql_parsing_builder.py (file removed)
@@ -1,306 +0,0 @@
-import logging
-import time
-from collections import defaultdict
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import Collection, Dict, Iterable, List, Optional, Set
-
-from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator
-from datahub.metadata.schema_classes import (
-    AuditStampClass,
-    DatasetLineageTypeClass,
-    FineGrainedLineageClass,
-    FineGrainedLineageDownstreamTypeClass,
-    FineGrainedLineageUpstreamTypeClass,
-    OperationClass,
-    OperationTypeClass,
-    UpstreamClass,
-    UpstreamLineageClass,
-)
-from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
-from datahub.utilities.file_backed_collections import FileBackedDict
-
-logger = logging.getLogger(__name__)
-
-# TODO: Use this over other sources' equivalent code, if possible
-
-DatasetUrn = str
-FieldUrn = str
-UserUrn = str
-
-
-@dataclass
-class LineageEdge:
-    """Stores information about a single lineage edge, from an upstream table to a downstream table."""
-
-    downstream_urn: DatasetUrn
-    upstream_urn: DatasetUrn
-    audit_stamp: Optional[datetime]
-    actor: Optional[UserUrn]
-    type: str = DatasetLineageTypeClass.TRANSFORMED
-
-    # Maps downstream_col -> {upstream_col}
-    column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
-
-    def gen_upstream_aspect(self) -> UpstreamClass:
-        return UpstreamClass(
-            auditStamp=(
-                AuditStampClass(
-                    time=int(self.audit_stamp.timestamp() * 1000),
-                    actor=self.actor or "",
-                )
-                if self.audit_stamp
-                else None
-            ),
-            dataset=self.upstream_urn,
-            type=self.type,
-        )
-
-    def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]:
-        for downstream_col, upstream_cols in self.column_map.items():
-            yield FineGrainedLineageClass(
-                upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
-                # Sort to avoid creating multiple aspects in backend with same lineage but different order
-                upstreams=sorted(
-                    make_schema_field_urn(self.upstream_urn, col)
-                    for col in upstream_cols
-                ),
-                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
-                downstreams=[
-                    make_schema_field_urn(self.downstream_urn, downstream_col)
-                ],
-            )
-
-
-@dataclass
-class SqlParsingBuilder:
-    # Open question: does it make sense to iterate over out_tables? When will we have multiple?
-
-    generate_lineage: bool = True
-    generate_usage_statistics: bool = True
-    generate_operations: bool = True
-    usage_config: Optional[BaseUsageConfig] = None
-
-    # Maps downstream urn -> upstream urn -> LineageEdge
-    # Builds up a single LineageEdge for each upstream -> downstream pair
-    _lineage_map: FileBackedDict[Dict[DatasetUrn, LineageEdge]] = field(
-        default_factory=FileBackedDict, init=False
-    )
-
-    # TODO: Replace with FileBackedDict approach like in BigQuery usage
-    _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False)
-
-    def __post_init__(self) -> None:
-        if self.usage_config:
-            self._usage_aggregator = UsageAggregator(self.usage_config)
-        elif self.generate_usage_statistics:
-            logger.info("No usage config provided, not generating usage statistics")
-            self.generate_usage_statistics = False
-
-    def process_sql_parsing_result(
-        self,
-        result: SqlParsingResult,
-        *,
-        query: str,
-        query_timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-        custom_operation_type: Optional[str] = None,
-        include_urns: Optional[Set[DatasetUrn]] = None,
-        include_column_lineage: bool = True,
-    ) -> Iterable[MetadataWorkUnit]:
-        """Process a single query and yield any generated workunits.
-
-        Args:
-            result: The result of parsing the query, or a mock result if parsing failed.
-            query: The SQL query to parse and process.
-            query_timestamp: When the query was run.
-            is_view_ddl: Whether the query is a DDL statement that creates a view.
-            user: The urn of the user who ran the query.
-            custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed.
-            include_urns: If provided, only generate workunits for these urns.
-        """
-        downstreams_to_ingest = result.out_tables
-        upstreams_to_ingest = result.in_tables
-        if include_urns:
-            logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}")
-            downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns)
-            upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns)
-
-        if self.generate_lineage:
-            for downstream_urn in downstreams_to_ingest:
-                # Set explicitly so that FileBackedDict registers any mutations
-                self._lineage_map[downstream_urn] = _merge_lineage_data(
-                    downstream_urn=downstream_urn,
-                    upstream_urns=result.in_tables,
-                    column_lineage=(
-                        result.column_lineage if include_column_lineage else None
-                    ),
-                    upstream_edges=self._lineage_map.get(downstream_urn, {}),
-                    query_timestamp=query_timestamp,
-                    is_view_ddl=is_view_ddl,
-                    user=user,
-                )
-
-        if self.generate_usage_statistics and query_timestamp is not None:
-            upstream_fields = compute_upstream_fields(result)
-            for upstream_urn in upstreams_to_ingest:
-                self._usage_aggregator.aggregate_event(
-                    resource=upstream_urn,
-                    start_time=query_timestamp,
-                    query=query,
-                    user=user,
-                    fields=sorted(upstream_fields.get(upstream_urn, [])),
-                )
-
-        if self.generate_operations and query_timestamp is not None:
-            for downstream_urn in downstreams_to_ingest:
-                yield from _gen_operation_workunit(
-                    result,
-                    downstream_urn=downstream_urn,
-                    query_timestamp=query_timestamp,
-                    user=user,
-                    custom_operation_type=custom_operation_type,
-                )
-
-    def add_lineage(
-        self,
-        downstream_urn: DatasetUrn,
-        upstream_urns: Collection[DatasetUrn],
-        timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-    ) -> None:
-        """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails."""
-        # Set explicitly so that FileBackedDict registers any mutations
-        self._lineage_map[downstream_urn] = _merge_lineage_data(
-            downstream_urn=downstream_urn,
-            upstream_urns=upstream_urns,
-            column_lineage=None,
-            upstream_edges=self._lineage_map.get(downstream_urn, {}),
-            query_timestamp=timestamp,
-            is_view_ddl=is_view_ddl,
-            user=user,
-        )
-
-    def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
-        if self.generate_lineage:
-            for mcp in self._gen_lineage_mcps():
-                yield mcp.as_workunit()
-        if self.generate_usage_statistics:
-            yield from self._gen_usage_statistics_workunits()
-
-    def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
-        for downstream_urn in self._lineage_map:
-            upstreams: List[UpstreamClass] = []
-            fine_upstreams: List[FineGrainedLineageClass] = []
-            for edge in self._lineage_map[downstream_urn].values():
-                upstreams.append(edge.gen_upstream_aspect())
-                fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
-
-            if not upstreams:
-                continue
-
-            upstream_lineage = UpstreamLineageClass(
-                upstreams=sorted(upstreams, key=lambda x: x.dataset),
-                fineGrainedLineages=sorted(
-                    fine_upstreams,
-                    key=lambda x: (x.downstreams, x.upstreams),
-                )
-                or None,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=downstream_urn, aspect=upstream_lineage
-            )
-
-    def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
-        yield from self._usage_aggregator.generate_workunits(
-            resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn
-        )
-
-
-def _merge_lineage_data(
-    downstream_urn: DatasetUrn,
-    *,
-    upstream_urns: Collection[DatasetUrn],
-    column_lineage: Optional[List[ColumnLineageInfo]],
-    upstream_edges: Dict[DatasetUrn, LineageEdge],
-    query_timestamp: Optional[datetime],
-    is_view_ddl: bool,
-    user: Optional[UserUrn],
-) -> Dict[str, LineageEdge]:
-    for upstream_urn in upstream_urns:
-        edge = upstream_edges.setdefault(
-            upstream_urn,
-            LineageEdge(
-                downstream_urn=downstream_urn,
-                upstream_urn=upstream_urn,
-                audit_stamp=query_timestamp,
-                actor=user,
-                type=(
-                    DatasetLineageTypeClass.VIEW
-                    if is_view_ddl
-                    else DatasetLineageTypeClass.TRANSFORMED
-                ),
-            ),
-        )
-        if query_timestamp and (  # Use the most recent query
-            edge.audit_stamp is None or query_timestamp > edge.audit_stamp
-        ):
-            edge.audit_stamp = query_timestamp
-            if user:
-                edge.actor = user
-
-    # Note: Inefficient as we loop through all column_lineage entries for each downstream table
-    for cl in column_lineage or []:
-        if cl.downstream.table == downstream_urn:
-            for upstream_column_info in cl.upstreams:
-                if upstream_column_info.table not in upstream_urns:
-                    continue
-                column_map = upstream_edges[upstream_column_info.table].column_map
-                column_map[cl.downstream.column].add(upstream_column_info.column)
-
-    return upstream_edges
-
-
-def compute_upstream_fields(
-    result: SqlParsingResult,
-) -> Dict[DatasetUrn, Set[DatasetUrn]]:
-    upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set)
-    for cl in result.column_lineage or []:
-        for upstream in cl.upstreams:
-            upstream_fields[upstream.table].add(upstream.column)
-    return upstream_fields
-
-
-def _gen_operation_workunit(
-    result: SqlParsingResult,
-    *,
-    downstream_urn: DatasetUrn,
-    query_timestamp: datetime,
-    user: Optional[UserUrn],
-    custom_operation_type: Optional[str],
-) -> Iterable[MetadataWorkUnit]:
-    operation_type = result.query_type.to_operation_type()
-    # Filter out SELECT and other undesired statements
-    if operation_type is None:
-        return
-    elif operation_type == OperationTypeClass.UNKNOWN:
-        if custom_operation_type is None:
-            return
-        else:
-            operation_type = OperationTypeClass.CUSTOM
-
-    aspect = OperationClass(
-        timestampMillis=int(time.time() * 1000),
-        operationType=operation_type,
-        lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000),
-        actor=user,
-        customOperationType=custom_operation_type,
-    )
-    yield MetadataChangeProposalWrapper(
-        entityUrn=downstream_urn, aspect=aspect
-    ).as_workunit()