acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc5__py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/METADATA +2575 -2575
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/RECORD +52 -45
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +21 -4
- datahub/ingestion/api/decorators.py +14 -3
- datahub/ingestion/api/report.py +123 -2
- datahub/ingestion/api/source.py +45 -44
- datahub/ingestion/autogenerated/lineage_helper.py +193 -0
- datahub/ingestion/graph/client.py +71 -28
- datahub/ingestion/run/pipeline.py +6 -0
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +4 -4
- datahub/ingestion/source/common/subtypes.py +43 -0
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
- datahub/ingestion/source/sql/athena.py +15 -3
- datahub/ingestion/source/sql/mssql/source.py +9 -0
- datahub/ingestion/source/sql/sql_common.py +3 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -1
- datahub/ingestion/source/sql/vertica.py +9 -1
- datahub/ingestion/source/tableau/tableau.py +6 -1
- datahub/ingestion/source/unity/source.py +36 -20
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/metadata/_internal_schema_classes.py +601 -0
- datahub/metadata/_urns/urn_defs.py +112 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +383 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
- datahub/sdk/datajob.py +39 -15
- datahub/specific/dataproduct.py +4 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/top_level.txt +0 -0
datahub/ingestion/api/source.py
CHANGED
@@ -2,7 +2,6 @@ import contextlib
 import datetime
 import logging
 from abc import ABCMeta, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
 from functools import partial
@@ -15,7 +14,6 @@ from typing import (
     List,
     Optional,
     Sequence,
-    Set,
     Type,
     TypeVar,
     Union,
@@ -28,7 +26,6 @@ from typing_extensions import LiteralString, Self
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -37,7 +34,7 @@ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
-from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.report import ExamplesReport, Report
 from datahub.ingestion.api.source_helpers import (
     AutoSystemMetadata,
     auto_browse_path_v2,
@@ -50,9 +47,8 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.schema_classes import UpstreamLineageClass
 from datahub.sdk.entity import Entity
+from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation
 
@@ -191,20 +187,11 @@ class StructuredLogs(Report):
 
 
 @dataclass
-class SourceReport(
+class SourceReport(ExamplesReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
-    _urns_seen: Set[str] = field(default_factory=set)
-    entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
-    aspects: Dict[str, Dict[str, int]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(int))
-    )
-    aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
-    )
-
     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
 
     @property
@@ -221,34 +208,10 @@ class SourceReport(Report):
 
     def report_workunit(self, wu: WorkUnit) -> None:
         self.events_produced += 1
+        if not isinstance(wu, MetadataWorkUnit):
+            return
 
-
-        urn = wu.get_urn()
-
-        # Specialized entity reporting.
-        if not isinstance(wu.metadata, MetadataChangeEvent):
-            mcps = [wu.metadata]
-        else:
-            mcps = list(mcps_from_mce(wu.metadata))
-
-        for mcp in mcps:
-            entityType = mcp.entityType
-            aspectName = mcp.aspectName
-
-            if urn not in self._urns_seen:
-                self._urns_seen.add(urn)
-                self.entities[entityType].append(urn)
-
-            if aspectName is not None:  # usually true
-                self.aspects[entityType][aspectName] += 1
-                self.aspect_urn_samples[entityType][aspectName].append(urn)
-                if isinstance(mcp.aspect, UpstreamLineageClass):
-                    upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
-                    if upstream_lineage.fineGrainedLineages:
-                        self.aspect_urn_samples[entityType][
-                            "fineGrainedLineages"
-                        ].append(urn)
-                        self.aspects[entityType]["fineGrainedLineages"] += 1
+        super()._store_workunit_data(wu)
 
     def report_warning(
         self,
@@ -327,6 +290,7 @@ class SourceReport(Report):
         )
 
     def __post_init__(self) -> None:
+        super().__post_init__()
        self.start_time = datetime.datetime.now()
        self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
 
@@ -339,6 +303,43 @@ class SourceReport(Report):
             "infos": Report.to_pure_python_obj(self.infos),
         }
 
+    @staticmethod
+    def _discretize_dict_values(
+        nested_dict: Dict[str, Dict[str, int]],
+    ) -> Dict[str, Dict[str, int]]:
+        """Helper method to discretize values in a nested dictionary structure."""
+        result = {}
+        for outer_key, inner_dict in nested_dict.items():
+            discretized_dict: Dict[str, int] = {}
+            for inner_key, count in inner_dict.items():
+                discretized_dict[inner_key] = stats.discretize(count)
+            result[outer_key] = discretized_dict
+        return result
+
+    def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
+        """Convert the nested defaultdict aspects to a regular dict for serialization."""
+        return self._discretize_dict_values(self.aspects)
+
+    def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Get aspect counts grouped by entity type and subtype."""
+        return self._discretize_dict_values_nested(self.aspects_by_subtypes)
+
+    @staticmethod
+    def _discretize_dict_values_nested(
+        nested_dict: Dict[str, Dict[str, Dict[str, int]]],
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Helper method to discretize values in a nested dictionary structure with three levels."""
+        result = {}
+        for outer_key, middle_dict in nested_dict.items():
+            discretized_middle_dict: Dict[str, Dict[str, int]] = {}
+            for middle_key, inner_dict in middle_dict.items():
+                discretized_inner_dict: Dict[str, int] = {}
+                for inner_key, count in inner_dict.items():
+                    discretized_inner_dict[inner_key] = stats.discretize(count)
+                discretized_middle_dict[middle_key] = discretized_inner_dict
+            result[outer_key] = discretized_middle_dict
+        return result
+
     def compute_stats(self) -> None:
         super().compute_stats()
 
@@ -505,7 +506,7 @@ class Source(Closeable, metaclass=ABCMeta):
         pass
 
     def close(self) -> None:
-
+        self.get_report().close()
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
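The SourceReport refactor above delegates the per-entity/per-aspect bookkeeping to the new ExamplesReport base class and keeps only thin discretization helpers that feed telemetry. The sketch below only illustrates what that discretization step does to a counts dictionary; the discretize function here is a stand-in for datahub.telemetry.stats.discretize, whose exact bucketing may differ.

from typing import Dict


def discretize(value: int) -> int:
    # Stand-in for datahub.telemetry.stats.discretize: round a raw count down to a
    # coarse bucket so telemetry does not carry exact counts.
    bucket = 1
    while bucket * 10 <= value:
        bucket *= 10
    return (value // bucket) * bucket


def discretize_dict_values(nested: Dict[str, Dict[str, int]]) -> Dict[str, Dict[str, int]]:
    # Mirrors the shape transformation of SourceReport._discretize_dict_values:
    # same keys, bucketed counts.
    return {
        outer: {inner: discretize(count) for inner, count in inner_dict.items()}
        for outer, inner_dict in nested.items()
    }


if __name__ == "__main__":
    aspects = {"dataset": {"schemaMetadata": 1234, "status": 87}}
    print(discretize_dict_values(aspects))
    # -> {'dataset': {'schemaMetadata': 1000, 'status': 80}}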
datahub/ingestion/autogenerated/lineage_helper.py
ADDED
@@ -0,0 +1,193 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+from datahub.utilities.urns.urn import guess_entity_type
+
+logger = logging.getLogger(__name__)
+
+# Global cache for lineage data to avoid repeated file reads
+_lineage_data: Optional[Dict] = None
+
+
+def _load_lineage_data() -> Dict:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Load lineage data from the autogenerated lineage.json file.
+
+    Returns:
+        Dict containing the lineage information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    global _lineage_data
+
+    if _lineage_data is not None:
+        return _lineage_data
+
+    # Get the path to lineage.json relative to this file
+    current_file = Path(__file__)
+    lineage_file = current_file.parent / "lineage.json"
+
+    if not lineage_file.exists():
+        raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+
+    try:
+        with open(lineage_file, "r") as f:
+            _lineage_data = json.load(f)
+            return _lineage_data
+    except json.JSONDecodeError as e:
+        raise json.JSONDecodeError(
+            f"Failed to parse lineage.json: {e}", e.doc, e.pos
+        ) from e
+
+
+def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get lineage fields for a specific entity type and aspect.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        List of lineage field dictionaries, each containing:
+        - name: field name
+        - path: dot-notation path to the field
+        - isLineage: boolean indicating if it's lineage
+        - relationship: relationship information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    aspect_data = entity_data.get(aspect_name, {})
+
+    return aspect_data.get("fields", [])
+
+
+def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if a specific field path is lineage-related.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+        field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
+
+    Returns:
+        True if the field is lineage-related, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+
+    for field in lineage_fields:
+        if field.get("path") == field_path:
+            return field.get("isLineage", False)
+
+    return False
+
+
+def has_lineage(urn: str, aspect: Any) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect: The aspect object
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    aspect_class = getattr(aspect, "__class__", None)
+    aspect_name = (
+        aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+    )
+
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get all aspects that have lineage fields for a given entity type.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+
+    Returns:
+        Set of aspect names that have lineage fields
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    lineage_aspects = set()
+
+    for aspect_name, aspect_data in entity_data.items():
+        if aspect_data.get("fields"):
+            lineage_aspects.add(aspect_name)
+
+    return lineage_aspects
+
+
+def clear_cache() -> None:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Clear the internal cache of lineage data.
+
+    This is useful for testing or when the lineage.json file has been updated.
+    """
+    global _lineage_data
+    _lineage_data = None
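A brief usage sketch for the new autogenerated lineage helper. The concrete results depend on the lineage.json file bundled with the wheel, so the calls below are illustrative only.

# Illustrative usage of the new lineage_helper module; the exact output depends
# on the autogenerated lineage.json shipped alongside it.
from datahub.ingestion.autogenerated import lineage_helper

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)"

# Which aspects of a dataset can carry lineage?
print(lineage_helper.get_all_lineage_aspects("dataset"))

# Is a specific field path within an aspect lineage-related?
print(lineage_helper.is_lineage_field(dataset_urn, "upstreamLineage", "upstreams.dataset"))

# Drop the module-level cache if lineage.json has been regenerated.
lineage_helper.clear_cache()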
datahub/ingestion/graph/client.py
CHANGED
@@ -22,6 +22,7 @@ from typing import (
     Union,
 )
 
+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError
@@ -504,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)
 
         values: Optional[List] = resp.get("value", {}).get("values")
@@ -524,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -654,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
     @property
     def _search_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/entities?action=search"
 
     @property
     def _relationships_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/openapi/relationships/v1/"
 
     @property
     def _aspect_count_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/aspects?action=getCount"
 
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -1209,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.
+        url = f"{self._gms_server}/api/graphql"
 
         body: Dict = {
             "query": query,
@@ -1434,40 +1435,82 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects
 
+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
     def restore_indices(
         self,
-        urn_pattern: str,
+        urn_pattern: Optional[str] = None,
         aspect: Optional[str] = None,
         start: Optional[int] = None,
         batch_size: Optional[int] = None,
-
+        file: Optional[str] = None,
+    ) -> None:
         """Restore the indices for a given urn or urn-like pattern.
 
         Args:
-            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
             aspect: Optional aspect string to restore indices for a specific aspect.
-            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
-            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.
 
         Returns:
             A string containing the result of the restore indices operation. This format is subject to change.
         """
-
-
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-        return result
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+            if aspect is not None:
+                payload_obj["aspect"] = aspect
+            if start is not None:
+                payload_obj["start"] = start
+            if batch_size is not None:
+                payload_obj["batchSize"] = batch_size
+            self._restore_index_call(payload_obj)
 
     @functools.lru_cache
     def _make_schema_resolver(
@@ -1774,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()
 
@@ -1831,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }
 
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"
 
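A hedged sketch of how the new DataHubGraph additions might be called. It assumes an already-configured DataHubGraph instance and a plain-text file of URNs; the file path and aspect name below are placeholders.

# Illustrative use of the new DataHubGraph methods, assuming `graph` is an
# already-configured DataHubGraph instance (construction omitted here).
from datahub.ingestion.graph.client import DataHubGraph


def reindex_from_file(graph: DataHubGraph, path: str = "urns_to_restore.txt") -> None:
    # Each line of the file is either an exact URN or a %-wildcard pattern;
    # restore_indices() now issues one restoreIndices call per line.
    graph.restore_indices(file=path, aspect="datasetProperties")


def dump_consumer_lag(graph: DataHubGraph) -> None:
    # Returns per-consumer offset details for the mcp, mcl and mcl-timeseries consumers.
    offsets = graph.get_kafka_consumer_offsets()
    for consumer, payload in offsets.items():
        print(consumer, payload)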
datahub/ingestion/run/pipeline.py
CHANGED
@@ -578,11 +578,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
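The pipeline now attaches the discretized aspect counters from SourceReport to the ingest_stats telemetry ping. The values below are hypothetical and only illustrate the shape of the two new fields: bucketed counts per entity type (and per subtype), not URNs or raw totals.

# Hypothetical shape of the two new "ingest_stats" fields; actual keys and
# bucketed counts depend entirely on the metadata that was ingested.
source_aspects = {
    "dataset": {"schemaMetadata": 100, "status": 100},
}
source_aspects_by_subtype = {
    "dataset": {"Table": {"schemaMetadata": 80, "status": 80}},
}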
datahub/ingestion/source/aws/glue.py
CHANGED
@@ -269,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
datahub/ingestion/source/bigquery_v2/profiler.py
CHANGED
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py
CHANGED
@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
-    ts.size_bytes as
+    ts.row_count as row_count,
+    ts.size_bytes as size_bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    -- IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
datahub/ingestion/source/common/subtypes.py
CHANGED
@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -74,6 +81,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +126,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.error(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
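SourceCapabilityModifier is assembled at import time by merging the member lists of the subtype enums via exec. A small, illustrative check of the merged enum, using member names visible in this diff; the printed values assume the merge keeps each member's original string value.

# Illustrative: the dynamically built SourceCapabilityModifier exposes the union
# of the subtype enum members, so values added above (e.g. SLACK_CHANNEL) show up
# on it as well.
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

print(SourceCapabilityModifier.SLACK_CHANNEL.value)  # "Slack Channel"
print(SourceCapabilityModifier.PROJECTIONS.value)    # "Projections"
print(list(SourceCapabilityModifier)[:5])            # first few merged members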
datahub/ingestion/source/dbt/dbt_common.py
CHANGED
@@ -355,7 +355,7 @@ class DBTCommonConfig(
     # override default value to True.
     incremental_lineage: bool = Field(
         default=True,
-        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
+        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
     )
 
     _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")