acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/METADATA +2526 -2526
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/RECORD +38 -35
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +18 -1
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/run/pipeline.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +12 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -0
- datahub/ingestion/source/dremio/dremio_api.py +98 -68
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +90 -77
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/ge_data_profiler.py +48 -8
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +384 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +17 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/metadata/_internal_schema_classes.py +3 -0
- datahub/metadata/schema.avsc +2 -0
- datahub/metadata/schemas/ContainerProperties.avsc +2 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
- datahub/metadata/schemas/DataJobInfo.avsc +2 -0
- datahub/metadata/schemas/DataProcessKey.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
- datahub/metadata/schemas/MLModelKey.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5rc1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from datahub.configuration.common import ConfigModel
|
|
7
|
+
from datahub.emitter.mce_builder import make_dataset_urn
|
|
8
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
9
|
+
from datahub.ingestion.api.common import PipelineContext
|
|
10
|
+
from datahub.ingestion.api.decorators import (
|
|
11
|
+
SupportStatus,
|
|
12
|
+
config_class,
|
|
13
|
+
platform_name,
|
|
14
|
+
support_status,
|
|
15
|
+
)
|
|
16
|
+
from datahub.ingestion.api.source import Source, SourceReport
|
|
17
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
18
|
+
from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
|
|
19
|
+
from datahub.metadata.schema_classes import (
|
|
20
|
+
DatasetLineageTypeClass,
|
|
21
|
+
StatusClass,
|
|
22
|
+
SubTypesClass,
|
|
23
|
+
UpstreamClass,
|
|
24
|
+
UpstreamLineageClass,
|
|
25
|
+
)
|
|
26
|
+
from datahub.utilities.str_enum import StrEnum
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SubTypePattern(StrEnum):
    """Strategies for choosing the SubTypes value of each generated mock dataset."""

    # Even table indexes within a level become "Table", odd ones "View".
    ALTERNATING = "alternating"
    # Every generated dataset is a "Table".
    ALL_TABLE = "all_table"
    # Every generated dataset is a "View".
    ALL_VIEW = "all_view"
    # The subtype is looked up per level in `level_subtypes`.
    LEVEL_BASED = "level_based"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LineageConfigGen1(ConfigModel):
    """
    Settings for the first-generation mock lineage generator (testing only).

    The generator builds a tree-shaped lineage graph, level by level:

    - Level 0 holds a single root table.
    - Level 1 holds `lineage_fan_out` tables, all downstream of the root.
    - Level N (N >= 2) holds `lineage_fan_out ** N` tables by default. When
      `lineage_fan_out_after_first_hop` is set, it instead holds
      (tables at level N - 1) * `lineage_fan_out_after_first_hop` tables.
      Each table at level N is downstream of one table at level N - 1.
    - Levels are generated up to and including level `lineage_hops`.

    Worked examples:

    - lineage_fan_out=2, lineage_hops=1 -> 3 tables (1 + 2), 2 lineage edges.
    - lineage_fan_out=3, lineage_hops=2 -> 13 tables (1 + 3 + 9), 12 lineage edges.
    - lineage_fan_out=4, lineage_hops=1 -> 5 tables (1 + 4), 4 lineage edges.
    - lineage_fan_out=3, lineage_hops=3, lineage_fan_out_after_first_hop=2
      -> 22 tables (1 + 3 + 6 + 12); growth is no longer exponential.

    Generated tables are named
    "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}".
    """

    # Master switch for this generator; nothing is emitted while it is False.
    emit_lineage: bool = Field(
        default=False,
        description=(
            "Whether to emit lineage data for testing purposes. "
            "When False, no lineage data is generated regardless of other settings."
        ),
    )

    # Width of the graph: downstream tables per upstream table.
    lineage_fan_out: int = Field(
        default=3,
        description=(
            "Number of downstream tables that each upstream table connects to. "
            "This controls the 'width' of the lineage graph. "
            "Higher values create more parallel downstream tables per level."
        ),
    )

    # Depth of the graph: number of levels below the root.
    lineage_hops: int = Field(
        default=2,
        description=(
            "Number of hops (levels) in the lineage graph. "
            "This controls the 'depth' of the lineage graph. "
            "Level 0 is the root table, and each subsequent level contains downstream tables. "
            "Higher values create deeper lineage chains."
        ),
    )

    # Optional per-table fan-out used from level 2 onwards.
    lineage_fan_out_after_first_hop: Optional[int] = Field(
        default=None,
        description=(
            "Optional limit on fanout for hops after the first hop. "
            "When set, prevents exponential growth by limiting the number of downstream tables per upstream table at levels 2 and beyond. "
            "When None, uses the standard exponential growth (lineage_fan_out^level)."
        ),
    )

    # Strategy used to pick "Table" vs "View" for each generated dataset.
    subtype_pattern: SubTypePattern = Field(
        default=SubTypePattern.ALTERNATING,
        description=(
            "Pattern for determining SubTypes. "
            "Options: 'alternating', 'all_table', 'all_view', 'level_based'"
        ),
    )

    # Only read when subtype_pattern is 'level_based'; unmapped levels fall back to "Table".
    level_subtypes: Dict[int, str] = Field(
        default={0: "Table", 1: "View", 2: "Table"},
        description="Mapping of level to subtype for level_based pattern",
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class DataHubMockDataConfig(ConfigModel):
    """Top-level configuration for the DataHub mock data source."""

    # Global on/off flag for the source.
    enabled: bool = Field(default=True, description="Whether this source is enabled")

    # Settings for the first-generation lineage generator; defaults to an
    # all-default LineageConfigGen1 instance.
    gen_1: LineageConfigGen1 = Field(
        default_factory=LineageConfigGen1,
        description="Configuration for lineage data generation",
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@platform_name("DataHubMockData")
@config_class(DataHubMockDataConfig)
@support_status(SupportStatus.TESTING)
class DataHubMockDataSource(Source):
    """
    This source is for generating mock data for testing purposes.
    Expect breaking changes as we iterate on the mock data source.

    Every generated dataset lives on the "fake" platform and receives a
    Status aspect, a SubTypes aspect and, for non-root tables, an
    UpstreamLineage aspect pointing at its single upstream table.
    """

    def __init__(self, ctx: PipelineContext, config: DataHubMockDataConfig):
        self.ctx = ctx
        self.config = config
        self.report = SourceReport()

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """
        Yield the mock work units, reporting each one on the source report.

        Nothing is produced when the source is disabled (`enabled: false`) or
        when `gen_1.emit_lineage` is off.
        """
        # We don't want any implicit aspects to be produced
        # so we are not using get_workunits_internal

        # Honour the top-level `enabled` switch; it was previously ignored.
        if not self.config.enabled:
            return

        if self.config.gen_1.emit_lineage:
            for wu in self._data_gen_1():
                self.report.report_workunit(wu)
                yield wu

    def _calculate_lineage_tables(
        self, fan_out: int, hops: int, fan_out_after_first: Optional[int] = None
    ) -> Tuple[int, List[int]]:
        """
        Calculate the total number of tables and tables at each level for lineage generation.

        Args:
            fan_out: Number of downstream tables per upstream table at level 1
            hops: Number of hops (levels) in the lineage graph
            fan_out_after_first: Optional limit on fanout for hops after the first hop

        Returns:
            Tuple of (total_tables, tables_at_levels) where tables_at_levels is a list
            containing the number of tables at each level (index 0 = level 0, etc.)
        """
        tables_at_levels: List[int] = []

        for level in range(hops + 1):
            if level == 0:
                # Level 0: always 1 table
                tables_at_level = 1
            elif level == 1:
                # Level 1: uses lineage_fan_out
                tables_at_level = fan_out
            elif fan_out_after_first is not None:
                # Level 2+: each table at the previous level creates
                # fan_out_after_first tables
                tables_at_level = tables_at_levels[level - 1] * fan_out_after_first
            else:
                # Level 2+: original exponential behavior
                tables_at_level = fan_out**level

            tables_at_levels.append(tables_at_level)

        return sum(tables_at_levels), tables_at_levels

    def _calculate_fanout_for_level(
        self, level: int, fan_out: int, fan_out_after_first: Optional[int] = None
    ) -> int:
        """
        Calculate the fanout (number of downstream tables) for a specific level.

        Args:
            level: The current level (0-based)
            fan_out: Number of downstream tables per upstream table at level 1
            fan_out_after_first: Optional limit on fanout for hops after the first hop

        Returns:
            The number of downstream tables that each table at this level should connect to
        """
        # Level 0 always uses the standard fan_out; deeper levels use the
        # override when one is configured.
        if level == 0 or fan_out_after_first is None:
            return fan_out
        return fan_out_after_first

    def _determine_subtype(
        self, table_name: str, table_level: int, table_index: int
    ) -> str:
        """
        Determine subtype based on configured pattern.

        Args:
            table_name: Name of the table (not used by any current pattern)
            table_level: Level of the table in the lineage graph
            table_index: Index of the table within its level

        Returns:
            The determined subtype. "Table" or "View" for the built-in patterns;
            for the level_based pattern, whatever `level_subtypes` maps the
            level to, defaulting to "Table".
        """
        pattern = self.config.gen_1.subtype_pattern

        if pattern == SubTypePattern.ALTERNATING:
            return "Table" if table_index % 2 == 0 else "View"
        elif pattern == SubTypePattern.LEVEL_BASED:
            return self.config.gen_1.level_subtypes.get(table_level, "Table")
        elif pattern == SubTypePattern.ALL_TABLE:
            return "Table"
        elif pattern == SubTypePattern.ALL_VIEW:
            return "View"
        else:
            return "Table"  # default

    def _get_subtypes_aspect(
        self, table_name: str, table_level: int, table_index: int
    ) -> MetadataWorkUnit:
        """
        Create a SubTypes aspect for a table based on deterministic pattern.

        Args:
            table_name: Name of the table
            table_level: Level of the table in the lineage graph
            table_index: Index of the table within its level

        Returns:
            MetadataWorkUnit containing the SubTypes aspect
        """
        # Determine subtype based on pattern
        subtype = self._determine_subtype(table_name, table_level, table_index)

        urn = make_dataset_urn(platform="fake", name=table_name)
        mcp = MetadataChangeProposalWrapper(
            entityUrn=urn,
            entityType="dataset",
            aspect=SubTypesClass(typeNames=[subtype]),
        )
        return mcp.as_workunit()

    def _data_gen_1(self) -> Iterable[MetadataWorkUnit]:
        """Generate mock lineage data for testing purposes."""
        gen_1 = self.config.gen_1
        fan_out = gen_1.lineage_fan_out
        hops = gen_1.lineage_hops
        fan_out_after_first = gen_1.lineage_fan_out_after_first_hop

        logger.info(
            "Generating lineage data with fan_out=%s, hops=%s, fan_out_after_first=%s",
            fan_out,
            hops,
            fan_out_after_first,
        )

        tables_to_be_created, tables_at_levels = self._calculate_lineage_tables(
            fan_out, hops, fan_out_after_first
        )

        logger.info(
            "About to create %s tables for lineage testing", tables_to_be_created
        )

        current_progress = 0
        for i, tables_at_level in enumerate(tables_at_levels):
            for j in range(tables_at_level):
                table_name = TableNamingHelper.generate_table_name(hops, fan_out, i, j)

                yield self._get_status_aspect(table_name)

                yield self._get_subtypes_aspect(table_name, i, j)

                yield from self._generate_lineage_for_table(
                    table_name=table_name,
                    table_level=i,
                    table_index=j,
                    hops=hops,
                    fan_out=fan_out,
                    fan_out_after_first=fan_out_after_first,
                    tables_at_levels=tables_at_levels,
                )

                # Periodic progress log so large graphs show signs of life.
                current_progress += 1
                if current_progress % 1000 == 0:
                    logger.info(
                        "Progress: %s/%s tables processed",
                        current_progress,
                        tables_to_be_created,
                    )

    def _generate_lineage_for_table(
        self,
        table_name: str,
        table_level: int,
        table_index: int,
        hops: int,
        fan_out: int,
        fan_out_after_first: Optional[int],
        tables_at_levels: List[int],
    ) -> Iterable[MetadataWorkUnit]:
        """Generate lineage relationships for a specific table."""
        # Only generate lineage if there are downstream levels
        if table_level + 1 > hops:
            return

        current_fan_out = self._calculate_fanout_for_level(
            table_level, fan_out, fan_out_after_first
        )

        yield from self._generate_downstream_lineage(
            upstream_table_name=table_name,
            upstream_table_index=table_index,
            upstream_table_level=table_level,
            current_fan_out=current_fan_out,
            hops=hops,
            fan_out=fan_out,
            tables_at_levels=tables_at_levels,
        )

    def _generate_downstream_lineage(
        self,
        upstream_table_name: str,
        upstream_table_index: int,
        upstream_table_level: int,
        current_fan_out: int,
        hops: int,
        fan_out: int,
        tables_at_levels: List[int],
    ) -> Iterable[MetadataWorkUnit]:
        """Generate lineage relationships to downstream tables."""
        downstream_level = upstream_table_level + 1
        downstream_tables_count = tables_at_levels[downstream_level]

        # Calculate range of downstream tables this upstream table connects to.
        # The upper bound is clamped to the number of tables that exist at the
        # downstream level.
        start_downstream = upstream_table_index * current_fan_out
        end_downstream = min(
            (upstream_table_index + 1) * current_fan_out, downstream_tables_count
        )

        for downstream_index in range(start_downstream, end_downstream):
            downstream_table_name = TableNamingHelper.generate_table_name(
                hops, fan_out, downstream_level, downstream_index
            )
            yield self._get_upstream_aspect(
                upstream_table=upstream_table_name,
                downstream_table=downstream_table_name,
            )

    def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
        """Build a work unit carrying a Status aspect with removed=False for `table`."""
        urn = make_dataset_urn(
            platform="fake",
            name=table,
        )
        mcp = MetadataChangeProposalWrapper(
            entityUrn=urn,
            entityType="dataset",
            aspect=StatusClass(removed=False),
        )
        return mcp.as_workunit()

    def _get_upstream_aspect(
        self, upstream_table: str, downstream_table: str
    ) -> MetadataWorkUnit:
        """
        Build a work unit carrying an UpstreamLineage aspect.

        The aspect is attached to `downstream_table` and lists `upstream_table`
        as its only upstream, with lineage type TRANSFORMED.
        """
        mcp = MetadataChangeProposalWrapper(
            entityUrn=make_dataset_urn(
                platform="fake",
                name=downstream_table,
            ),
            entityType="dataset",
            aspect=UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        dataset=make_dataset_urn(
                            platform="fake",
                            name=upstream_table,
                        ),
                        type=DatasetLineageTypeClass.TRANSFORMED,
                    )
                ],
            ),
        )
        return mcp.as_workunit()

    def get_report(self) -> SourceReport:
        """Return the report on which emitted work units are recorded."""
        return self.report
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TableNamingHelper:
    """
    Helper class for managing table naming conventions in mock data generation.

    Table naming pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
    """

    @staticmethod
    def generate_table_name(
        lineage_hops: int, lineage_fan_out: int, level: int, table_index: int
    ) -> str:
        """
        Generate a table name following the standard naming convention.

        Args:
            lineage_hops: Total number of hops in the lineage graph
            lineage_fan_out: Number of downstream tables per upstream table
            level: Level of the table in the lineage graph (0-based)
            table_index: Index of the table within its level (0-based)

        Returns:
            Table name following the pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
        """
        return f"hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"

    @staticmethod
    def parse_table_name(table_name: str) -> Dict[str, int]:
        """
        Parse a table name to extract its components.

        Only canonical names are accepted, i.e. names that `generate_table_name`
        would produce from the parsed values. Numerals with surrounding
        whitespace, an explicit "+" sign or leading zeros are rejected.

        Args:
            table_name: Table name following the standard naming convention

        Returns:
            Dictionary containing parsed components:
            - lineage_hops: Total number of hops in the lineage graph
            - lineage_fan_out: Number of downstream tables per upstream table
            - level: Level of the table in the lineage graph (0-based)
            - table_index: Index of the table within its level (0-based)

        Raises:
            ValueError: If the table name doesn't follow the expected pattern
        """
        try:
            # Expected pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
            parts = table_name.split("_")

            if (
                len(parts) != 6
                or parts[0] != "hops"
                or parts[2] != "f"
                or not parts[4].startswith("h")
                or not parts[5].startswith("t")
            ):
                raise ValueError(f"Invalid table name format: {table_name}")

            parsed = {
                "lineage_hops": int(parts[1]),
                "lineage_fan_out": int(parts[3]),  # lineage_fan_out is at index 3
                "level": int(parts[4][1:]),  # Remove 'h' prefix from parts[4]
                "table_index": int(parts[5][1:]),  # Remove 't' prefix from parts[5]
            }

            # int() tolerates whitespace, "+" signs, leading zeros and
            # non-ASCII digits. Regenerating the name from the parsed values
            # and comparing rejects every such non-canonical spelling.
            if TableNamingHelper.generate_table_name(**parsed) != table_name:
                raise ValueError(f"Invalid table name format: {table_name}")

            return parsed
        except (ValueError, IndexError) as e:
            raise ValueError(
                f"Failed to parse table name '{table_name}': {str(e)}"
            ) from e

    @staticmethod
    def is_valid_table_name(table_name: str) -> bool:
        """
        Check if a table name follows the expected naming convention.

        Args:
            table_name: Table name to validate

        Returns:
            True if the table name follows the expected pattern, False otherwise
        """
        try:
            TableNamingHelper.parse_table_name(table_name)
            return True
        except ValueError:
            return False
|
|
@@ -69,7 +69,7 @@ class PresetConfig(SupersetConfig):
|
|
|
69
69
|
|
|
70
70
|
@platform_name("Preset")
|
|
71
71
|
@config_class(PresetConfig)
|
|
72
|
-
@support_status(SupportStatus.
|
|
72
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
73
73
|
@capability(
|
|
74
74
|
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
|
|
75
75
|
)
|
|
@@ -10,6 +10,7 @@ import humanfriendly
|
|
|
10
10
|
import pydantic
|
|
11
11
|
import redshift_connector
|
|
12
12
|
|
|
13
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
13
14
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
14
15
|
from datahub.emitter.mce_builder import (
|
|
15
16
|
make_data_platform_urn,
|
|
@@ -357,7 +358,23 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
357
358
|
).workunit_processor,
|
|
358
359
|
]
|
|
359
360
|
|
|
361
|
+
def _warn_deprecated_configs(self) -> None:
    """
    Report a deprecation warning for the legacy schema_pattern matching mode.

    The warning is reported on `self.report` only when
    `match_fully_qualified_names` is explicitly set to a falsy value and
    `schema_pattern` differs from the allow-all pattern.
    """
    if (
        self.config.match_fully_qualified_names is not None
        and not self.config.match_fully_qualified_names
        and self.config.schema_pattern is not None
        and self.config.schema_pattern != AllowDenyPattern.allow_all()
    ):
        # The message is built by implicit string concatenation, so every
        # fragment needs its own trailing space to keep sentences apart.
        self.report.report_warning(
            message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`. "
            "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
            "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
            context="Config option deprecation warning",
            title="Config option deprecation warning",
        )
|
|
375
|
+
|
|
360
376
|
def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
|
|
377
|
+
self._warn_deprecated_configs()
|
|
361
378
|
connection = self._try_get_redshift_connection(self.config)
|
|
362
379
|
|
|
363
380
|
if connection is None:
|
|
@@ -89,6 +89,7 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
|
|
|
89
89
|
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
90
90
|
)
|
|
91
91
|
@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
|
|
92
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
92
93
|
@dataclasses.dataclass
|
|
93
94
|
class ClickHouseUsageSource(Source):
|
|
94
95
|
"""
|
|
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
|
|
|
15
15
|
import datahub.emitter.mce_builder as builder
|
|
16
16
|
from datahub.configuration.time_window_config import get_time_bucket
|
|
17
17
|
from datahub.ingestion.api.decorators import (
|
|
18
|
+
SourceCapability,
|
|
18
19
|
SupportStatus,
|
|
20
|
+
capability,
|
|
19
21
|
config_class,
|
|
20
22
|
platform_name,
|
|
21
23
|
support_status,
|
|
@@ -112,6 +114,7 @@ class TrinoUsageReport(SourceReport):
|
|
|
112
114
|
@platform_name("Trino")
|
|
113
115
|
@config_class(TrinoUsageConfig)
|
|
114
116
|
@support_status(SupportStatus.CERTIFIED)
|
|
117
|
+
@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
|
|
115
118
|
@dataclasses.dataclass
|
|
116
119
|
class TrinoUsageSource(Source):
|
|
117
120
|
"""
|
datahub/metadata/schema.avsc
CHANGED
|
@@ -9508,6 +9508,7 @@
|
|
|
9508
9508
|
"QA": "Designates quality assurance fabrics",
|
|
9509
9509
|
"RVW": "Designates review fabrics",
|
|
9510
9510
|
"SANDBOX": "Designates sandbox fabrics",
|
|
9511
|
+
"SBX": "Alternative spelling for sandbox",
|
|
9511
9512
|
"SIT": "System Integration Testing",
|
|
9512
9513
|
"STG": "Designates staging fabrics",
|
|
9513
9514
|
"TEST": "Designates testing fabrics",
|
|
@@ -9531,6 +9532,7 @@
|
|
|
9531
9532
|
"PRD",
|
|
9532
9533
|
"TST",
|
|
9533
9534
|
"SIT",
|
|
9535
|
+
"SBX",
|
|
9534
9536
|
"SANDBOX"
|
|
9535
9537
|
],
|
|
9536
9538
|
"doc": "Fabric group type"
|
|
@@ -99,6 +99,7 @@
|
|
|
99
99
|
"QA": "Designates quality assurance fabrics",
|
|
100
100
|
"RVW": "Designates review fabrics",
|
|
101
101
|
"SANDBOX": "Designates sandbox fabrics",
|
|
102
|
+
"SBX": "Alternative spelling for sandbox",
|
|
102
103
|
"SIT": "System Integration Testing",
|
|
103
104
|
"STG": "Designates staging fabrics",
|
|
104
105
|
"TEST": "Designates testing fabrics",
|
|
@@ -122,6 +123,7 @@
|
|
|
122
123
|
"PRD",
|
|
123
124
|
"TST",
|
|
124
125
|
"SIT",
|
|
126
|
+
"SBX",
|
|
125
127
|
"SANDBOX"
|
|
126
128
|
],
|
|
127
129
|
"doc": "Fabric group type"
|
|
@@ -153,6 +153,7 @@
|
|
|
153
153
|
"QA": "Designates quality assurance fabrics",
|
|
154
154
|
"RVW": "Designates review fabrics",
|
|
155
155
|
"SANDBOX": "Designates sandbox fabrics",
|
|
156
|
+
"SBX": "Alternative spelling for sandbox",
|
|
156
157
|
"SIT": "System Integration Testing",
|
|
157
158
|
"STG": "Designates staging fabrics",
|
|
158
159
|
"TEST": "Designates testing fabrics",
|
|
@@ -176,6 +177,7 @@
|
|
|
176
177
|
"PRD",
|
|
177
178
|
"TST",
|
|
178
179
|
"SIT",
|
|
180
|
+
"SBX",
|
|
179
181
|
"SANDBOX"
|
|
180
182
|
],
|
|
181
183
|
"doc": "Fabric group type"
|
|
@@ -219,6 +219,7 @@
|
|
|
219
219
|
"QA": "Designates quality assurance fabrics",
|
|
220
220
|
"RVW": "Designates review fabrics",
|
|
221
221
|
"SANDBOX": "Designates sandbox fabrics",
|
|
222
|
+
"SBX": "Alternative spelling for sandbox",
|
|
222
223
|
"SIT": "System Integration Testing",
|
|
223
224
|
"STG": "Designates staging fabrics",
|
|
224
225
|
"TEST": "Designates testing fabrics",
|
|
@@ -242,6 +243,7 @@
|
|
|
242
243
|
"PRD",
|
|
243
244
|
"TST",
|
|
244
245
|
"SIT",
|
|
246
|
+
"SBX",
|
|
245
247
|
"SANDBOX"
|
|
246
248
|
],
|
|
247
249
|
"doc": "Fabric group type"
|
|
@@ -52,6 +52,7 @@
|
|
|
52
52
|
"QA": "Designates quality assurance fabrics",
|
|
53
53
|
"RVW": "Designates review fabrics",
|
|
54
54
|
"SANDBOX": "Designates sandbox fabrics",
|
|
55
|
+
"SBX": "Alternative spelling for sandbox",
|
|
55
56
|
"SIT": "System Integration Testing",
|
|
56
57
|
"STG": "Designates staging fabrics",
|
|
57
58
|
"TEST": "Designates testing fabrics",
|
|
@@ -75,6 +76,7 @@
|
|
|
75
76
|
"PRD",
|
|
76
77
|
"TST",
|
|
77
78
|
"SIT",
|
|
79
|
+
"SBX",
|
|
78
80
|
"SANDBOX"
|
|
79
81
|
],
|
|
80
82
|
"doc": "Fabric group type"
|
|
@@ -89,6 +89,7 @@
|
|
|
89
89
|
"QA": "Designates quality assurance fabrics",
|
|
90
90
|
"RVW": "Designates review fabrics",
|
|
91
91
|
"SANDBOX": "Designates sandbox fabrics",
|
|
92
|
+
"SBX": "Alternative spelling for sandbox",
|
|
92
93
|
"SIT": "System Integration Testing",
|
|
93
94
|
"STG": "Designates staging fabrics",
|
|
94
95
|
"TEST": "Designates testing fabrics",
|
|
@@ -112,6 +113,7 @@
|
|
|
112
113
|
"PRD",
|
|
113
114
|
"TST",
|
|
114
115
|
"SIT",
|
|
116
|
+
"SBX",
|
|
115
117
|
"SANDBOX"
|
|
116
118
|
],
|
|
117
119
|
"doc": "Fabric group type"
|
|
@@ -64,6 +64,7 @@
|
|
|
64
64
|
"QA": "Designates quality assurance fabrics",
|
|
65
65
|
"RVW": "Designates review fabrics",
|
|
66
66
|
"SANDBOX": "Designates sandbox fabrics",
|
|
67
|
+
"SBX": "Alternative spelling for sandbox",
|
|
67
68
|
"SIT": "System Integration Testing",
|
|
68
69
|
"STG": "Designates staging fabrics",
|
|
69
70
|
"TEST": "Designates testing fabrics",
|
|
@@ -87,6 +88,7 @@
|
|
|
87
88
|
"PRD",
|
|
88
89
|
"TST",
|
|
89
90
|
"SIT",
|
|
91
|
+
"SBX",
|
|
90
92
|
"SANDBOX"
|
|
91
93
|
],
|
|
92
94
|
"doc": "Fabric group type"
|