acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (47)
  1. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2563 -2561
  2. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +46 -44
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataproduct/dataproduct.py +26 -0
  5. datahub/cli/config_utils.py +18 -10
  6. datahub/cli/docker_check.py +2 -1
  7. datahub/cli/docker_cli.py +4 -2
  8. datahub/cli/graphql_cli.py +1422 -0
  9. datahub/cli/quickstart_versioning.py +2 -2
  10. datahub/cli/specific/dataproduct_cli.py +2 -4
  11. datahub/cli/specific/user_cli.py +172 -1
  12. datahub/configuration/env_vars.py +331 -0
  13. datahub/configuration/kafka.py +6 -4
  14. datahub/emitter/mce_builder.py +2 -4
  15. datahub/emitter/rest_emitter.py +15 -15
  16. datahub/entrypoints.py +2 -0
  17. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  18. datahub/ingestion/api/source.py +5 -0
  19. datahub/ingestion/graph/client.py +197 -0
  20. datahub/ingestion/graph/config.py +2 -2
  21. datahub/ingestion/sink/datahub_rest.py +6 -5
  22. datahub/ingestion/source/aws/aws_common.py +20 -13
  23. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
  24. datahub/ingestion/source/grafana/models.py +5 -0
  25. datahub/ingestion/source/iceberg/iceberg.py +39 -19
  26. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
  27. datahub/ingestion/source/mode.py +13 -0
  28. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
  30. datahub/ingestion/source/sql/mssql/source.py +7 -1
  31. datahub/ingestion/source/sql/teradata.py +80 -65
  32. datahub/ingestion/source/unity/config.py +31 -0
  33. datahub/ingestion/source/unity/proxy.py +73 -0
  34. datahub/ingestion/source/unity/source.py +27 -70
  35. datahub/ingestion/source/unity/usage.py +46 -4
  36. datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
  37. datahub/sql_parsing/sqlglot_lineage.py +7 -0
  38. datahub/telemetry/telemetry.py +8 -3
  39. datahub/utilities/file_backed_collections.py +2 -2
  40. datahub/utilities/is_pytest.py +3 -2
  41. datahub/utilities/logging_manager.py +22 -6
  42. datahub/utilities/sample_data.py +5 -4
  43. datahub/emitter/sql_parsing_builder.py +0 -306
  44. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
  47. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/emitter/sql_parsing_builder.py (deleted)
@@ -1,306 +0,0 @@
- import logging
- import time
- from collections import defaultdict
- from dataclasses import dataclass, field
- from datetime import datetime
- from typing import Collection, Dict, Iterable, List, Optional, Set
-
- from datahub.emitter.mce_builder import make_schema_field_urn
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.ingestion.api.workunit import MetadataWorkUnit
- from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator
- from datahub.metadata.schema_classes import (
-     AuditStampClass,
-     DatasetLineageTypeClass,
-     FineGrainedLineageClass,
-     FineGrainedLineageDownstreamTypeClass,
-     FineGrainedLineageUpstreamTypeClass,
-     OperationClass,
-     OperationTypeClass,
-     UpstreamClass,
-     UpstreamLineageClass,
- )
- from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
- from datahub.utilities.file_backed_collections import FileBackedDict
-
- logger = logging.getLogger(__name__)
-
- # TODO: Use this over other sources' equivalent code, if possible
-
- DatasetUrn = str
- FieldUrn = str
- UserUrn = str
-
-
- @dataclass
- class LineageEdge:
-     """Stores information about a single lineage edge, from an upstream table to a downstream table."""
-
-     downstream_urn: DatasetUrn
-     upstream_urn: DatasetUrn
-     audit_stamp: Optional[datetime]
-     actor: Optional[UserUrn]
-     type: str = DatasetLineageTypeClass.TRANSFORMED
-
-     # Maps downstream_col -> {upstream_col}
-     column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
-
-     def gen_upstream_aspect(self) -> UpstreamClass:
-         return UpstreamClass(
-             auditStamp=(
-                 AuditStampClass(
-                     time=int(self.audit_stamp.timestamp() * 1000),
-                     actor=self.actor or "",
-                 )
-                 if self.audit_stamp
-                 else None
-             ),
-             dataset=self.upstream_urn,
-             type=self.type,
-         )
-
-     def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]:
-         for downstream_col, upstream_cols in self.column_map.items():
-             yield FineGrainedLineageClass(
-                 upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
-                 # Sort to avoid creating multiple aspects in backend with same lineage but different order
-                 upstreams=sorted(
-                     make_schema_field_urn(self.upstream_urn, col)
-                     for col in upstream_cols
-                 ),
-                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
-                 downstreams=[
-                     make_schema_field_urn(self.downstream_urn, downstream_col)
-                 ],
-             )
-
-
- @dataclass
- class SqlParsingBuilder:
-     # Open question: does it make sense to iterate over out_tables? When will we have multiple?
-
-     generate_lineage: bool = True
-     generate_usage_statistics: bool = True
-     generate_operations: bool = True
-     usage_config: Optional[BaseUsageConfig] = None
-
-     # Maps downstream urn -> upstream urn -> LineageEdge
-     # Builds up a single LineageEdge for each upstream -> downstream pair
-     _lineage_map: FileBackedDict[Dict[DatasetUrn, LineageEdge]] = field(
-         default_factory=FileBackedDict, init=False
-     )
-
-     # TODO: Replace with FileBackedDict approach like in BigQuery usage
-     _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False)
-
-     def __post_init__(self) -> None:
-         if self.usage_config:
-             self._usage_aggregator = UsageAggregator(self.usage_config)
-         elif self.generate_usage_statistics:
-             logger.info("No usage config provided, not generating usage statistics")
-             self.generate_usage_statistics = False
-
-     def process_sql_parsing_result(
-         self,
-         result: SqlParsingResult,
-         *,
-         query: str,
-         query_timestamp: Optional[datetime] = None,
-         is_view_ddl: bool = False,
-         user: Optional[UserUrn] = None,
-         custom_operation_type: Optional[str] = None,
-         include_urns: Optional[Set[DatasetUrn]] = None,
-         include_column_lineage: bool = True,
-     ) -> Iterable[MetadataWorkUnit]:
-         """Process a single query and yield any generated workunits.
-
-         Args:
-             result: The result of parsing the query, or a mock result if parsing failed.
-             query: The SQL query to parse and process.
-             query_timestamp: When the query was run.
-             is_view_ddl: Whether the query is a DDL statement that creates a view.
-             user: The urn of the user who ran the query.
-             custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed.
-             include_urns: If provided, only generate workunits for these urns.
-         """
-         downstreams_to_ingest = result.out_tables
-         upstreams_to_ingest = result.in_tables
-         if include_urns:
-             logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}")
-             downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns)
-             upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns)
-
-         if self.generate_lineage:
-             for downstream_urn in downstreams_to_ingest:
-                 # Set explicitly so that FileBackedDict registers any mutations
-                 self._lineage_map[downstream_urn] = _merge_lineage_data(
-                     downstream_urn=downstream_urn,
-                     upstream_urns=result.in_tables,
-                     column_lineage=(
-                         result.column_lineage if include_column_lineage else None
-                     ),
-                     upstream_edges=self._lineage_map.get(downstream_urn, {}),
-                     query_timestamp=query_timestamp,
-                     is_view_ddl=is_view_ddl,
-                     user=user,
-                 )
-
-         if self.generate_usage_statistics and query_timestamp is not None:
-             upstream_fields = compute_upstream_fields(result)
-             for upstream_urn in upstreams_to_ingest:
-                 self._usage_aggregator.aggregate_event(
-                     resource=upstream_urn,
-                     start_time=query_timestamp,
-                     query=query,
-                     user=user,
-                     fields=sorted(upstream_fields.get(upstream_urn, [])),
-                 )
-
-         if self.generate_operations and query_timestamp is not None:
-             for downstream_urn in downstreams_to_ingest:
-                 yield from _gen_operation_workunit(
-                     result,
-                     downstream_urn=downstream_urn,
-                     query_timestamp=query_timestamp,
-                     user=user,
-                     custom_operation_type=custom_operation_type,
-                 )
-
-     def add_lineage(
-         self,
-         downstream_urn: DatasetUrn,
-         upstream_urns: Collection[DatasetUrn],
-         timestamp: Optional[datetime] = None,
-         is_view_ddl: bool = False,
-         user: Optional[UserUrn] = None,
-     ) -> None:
-         """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails."""
-         # Set explicitly so that FileBackedDict registers any mutations
-         self._lineage_map[downstream_urn] = _merge_lineage_data(
-             downstream_urn=downstream_urn,
-             upstream_urns=upstream_urns,
-             column_lineage=None,
-             upstream_edges=self._lineage_map.get(downstream_urn, {}),
-             query_timestamp=timestamp,
-             is_view_ddl=is_view_ddl,
-             user=user,
-         )
-
-     def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
-         if self.generate_lineage:
-             for mcp in self._gen_lineage_mcps():
-                 yield mcp.as_workunit()
-         if self.generate_usage_statistics:
-             yield from self._gen_usage_statistics_workunits()
-
-     def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
-         for downstream_urn in self._lineage_map:
-             upstreams: List[UpstreamClass] = []
-             fine_upstreams: List[FineGrainedLineageClass] = []
-             for edge in self._lineage_map[downstream_urn].values():
-                 upstreams.append(edge.gen_upstream_aspect())
-                 fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
-
-             if not upstreams:
-                 continue
-
-             upstream_lineage = UpstreamLineageClass(
-                 upstreams=sorted(upstreams, key=lambda x: x.dataset),
-                 fineGrainedLineages=sorted(
-                     fine_upstreams,
-                     key=lambda x: (x.downstreams, x.upstreams),
-                 )
-                 or None,
-             )
-             yield MetadataChangeProposalWrapper(
-                 entityUrn=downstream_urn, aspect=upstream_lineage
-             )
-
-     def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
-         yield from self._usage_aggregator.generate_workunits(
-             resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn
-         )
-
-
- def _merge_lineage_data(
-     downstream_urn: DatasetUrn,
-     *,
-     upstream_urns: Collection[DatasetUrn],
-     column_lineage: Optional[List[ColumnLineageInfo]],
-     upstream_edges: Dict[DatasetUrn, LineageEdge],
-     query_timestamp: Optional[datetime],
-     is_view_ddl: bool,
-     user: Optional[UserUrn],
- ) -> Dict[str, LineageEdge]:
-     for upstream_urn in upstream_urns:
-         edge = upstream_edges.setdefault(
-             upstream_urn,
-             LineageEdge(
-                 downstream_urn=downstream_urn,
-                 upstream_urn=upstream_urn,
-                 audit_stamp=query_timestamp,
-                 actor=user,
-                 type=(
-                     DatasetLineageTypeClass.VIEW
-                     if is_view_ddl
-                     else DatasetLineageTypeClass.TRANSFORMED
-                 ),
-             ),
-         )
-         if query_timestamp and (  # Use the most recent query
-             edge.audit_stamp is None or query_timestamp > edge.audit_stamp
-         ):
-             edge.audit_stamp = query_timestamp
-             if user:
-                 edge.actor = user
-
-     # Note: Inefficient as we loop through all column_lineage entries for each downstream table
-     for cl in column_lineage or []:
-         if cl.downstream.table == downstream_urn:
-             for upstream_column_info in cl.upstreams:
-                 if upstream_column_info.table not in upstream_urns:
-                     continue
-                 column_map = upstream_edges[upstream_column_info.table].column_map
-                 column_map[cl.downstream.column].add(upstream_column_info.column)
-
-     return upstream_edges
-
-
- def compute_upstream_fields(
-     result: SqlParsingResult,
- ) -> Dict[DatasetUrn, Set[DatasetUrn]]:
-     upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set)
-     for cl in result.column_lineage or []:
-         for upstream in cl.upstreams:
-             upstream_fields[upstream.table].add(upstream.column)
-     return upstream_fields
-
-
- def _gen_operation_workunit(
-     result: SqlParsingResult,
-     *,
-     downstream_urn: DatasetUrn,
-     query_timestamp: datetime,
-     user: Optional[UserUrn],
-     custom_operation_type: Optional[str],
- ) -> Iterable[MetadataWorkUnit]:
-     operation_type = result.query_type.to_operation_type()
-     # Filter out SELECT and other undesired statements
-     if operation_type is None:
-         return
-     elif operation_type == OperationTypeClass.UNKNOWN:
-         if custom_operation_type is None:
-             return
-         else:
-             operation_type = OperationTypeClass.CUSTOM
-
-     aspect = OperationClass(
-         timestampMillis=int(time.time() * 1000),
-         operationType=operation_type,
-         lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000),
-         actor=user,
-         customOperationType=custom_operation_type,
-     )
-     yield MetadataChangeProposalWrapper(
-         entityUrn=downstream_urn, aspect=aspect
-     ).as_workunit()
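
Note on the removal above: item 43 in the file list deletes datahub/emitter/sql_parsing_builder.py entirely, so code importing SqlParsingBuilder will break on 1.3.0.1rc3. Below is a minimal sketch of how the removed class was typically driven, using only the API visible in the deleted file (add_lineage and gen_workunits). The dataset URNs are hypothetical placeholders, and the import resolves only against versions that still ship the module (1.3.0.1rc2 and earlier). Callers will presumably need to migrate, for example toward datahub/sql_parsing/sql_parsing_aggregator.py, which is also touched in this release.

    from datetime import datetime, timezone

    # Only importable on versions that still include the module (<= 1.3.0.1rc2).
    from datahub.emitter.sql_parsing_builder import SqlParsingBuilder

    # Hypothetical URNs, for illustration only.
    DOWNSTREAM = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.sales_summary,PROD)"
    UPSTREAM = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.raw_sales,PROD)"

    # Without a usage_config, the builder disables usage statistics in __post_init__;
    # setting the flag explicitly keeps that intent obvious.
    builder = SqlParsingBuilder(generate_usage_statistics=False)

    # Manually record a table-level lineage edge, e.g. when SQL parsing fails.
    builder.add_lineage(
        downstream_urn=DOWNSTREAM,
        upstream_urns=[UPSTREAM],
        timestamp=datetime.now(timezone.utc),
    )

    # Emit the accumulated lineage as metadata workunits.
    for wu in builder.gen_workunits():
        print(wu.id)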