acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
This version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/graph/client.py +104 -0
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/hex/mapper.py

```diff
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union

 from datahub._codegen.aspect import (
     _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DashboardUsageStatisticsClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TimeWindowSizeClass,
 )
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    ContainerUrn,
+    CorpUserUrn,
+    DashboardUrn,
+    DatasetUrn,
+    SchemaFieldUrn,
+    Urn,
+)

 logger = logging.getLogger(__name__)

@@ -116,6 +124,8 @@ class Mapper:
             ),
             externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
             customProperties=dict(id=project.id),
+            datasetEdges=self._dataset_edges(project.upstream_datasets),
+            # TODO: support schema field upstream, maybe InputFields?
         )

         subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
             else None,
         )

+    def _dataset_edges(
+        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
+    ) -> Optional[List[EdgeClass]]:
+        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
+        return (
+            [
+                EdgeClass(
+                    destinationUrn=upstream_urn.urn(),
+                )
+                for upstream_urn in upstream
+                if isinstance(upstream_urn, DatasetUrn)
+            ]
+            if upstream
+            else None
+        )
+
     def _yield_mcps(
         self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
     ) -> Iterable[MetadataWorkUnit]:
```
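For context, the new `_dataset_edges` helper feeds `EdgeClass` entries into the `datasetEdges` field of `DashboardInfoClass`. A minimal standalone sketch of that shape, not taken from the package; the platform and dataset name are made up:

```python
from datahub.metadata.schema_classes import (
    ChangeAuditStampsClass,
    DashboardInfoClass,
    EdgeClass,
)
from datahub.metadata.urns import DatasetUrn

# Hypothetical upstream dataset for a Hex project.
upstream = DatasetUrn(platform="snowflake", name="analytics.orders", env="PROD")

dashboard_info = DashboardInfoClass(
    title="Example Hex project",
    description="",
    lastModified=ChangeAuditStampsClass(),
    # Same shape the mapper produces: one EdgeClass per upstream DatasetUrn.
    datasetEdges=[EdgeClass(destinationUrn=upstream.urn())],
)
```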
datahub/ingestion/source/hex/model.py

```diff
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


 @dataclass
@@ -51,6 +53,12 @@ class Project:
     creator: Optional[Owner] = None
     owner: Optional[Owner] = None
     analytics: Optional[Analytics] = None
+    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
+    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )


 @dataclass
```
datahub/ingestion/source/hex/query_fetcher.py (new file)

```diff
@@ -0,0 +1,300 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result on missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # -- "categories": ["Scratchpad"],
+        # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # -- ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            self.report.filtered_out_queries_no_match += 1
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata will result on missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
```
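As a standalone illustration (not part of the package) of how `HEX_METADATA_PATTERN` pulls the project id and workspace name out of a Hex-generated SQL comment, using a comment shaped like the one in the module's own docstring:

```python
import re

# Pattern copied from query_fetcher.py above.
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

sql = (
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}\n'
    "SELECT 1"
)

match = re.search(HEX_METADATA_PATTERN, sql)
if match:
    project_id, workspace_name = match.group(1), match.group(2)
    print(project_id, workspace_name)
    # -> d73da67d-c87b-4dd8-9e7f-b79cb7f822cf acryl-partnership
```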
datahub/ingestion/source/iceberg/iceberg.py

```diff
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple

 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx

     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
```
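For context on the list returned above: a work unit processor is a callable that consumes and re-emits the stream of `MetadataWorkUnit` objects, which is why plain functions, `functools.partial` bindings, and bound methods can all be mixed in one list. A hedged, hypothetical sketch (the `log_workunit_ids` function below is not part of the package):

```python
from typing import Iterable

from datahub.ingestion.api.workunit import MetadataWorkUnit


def log_workunit_ids(
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[MetadataWorkUnit]:
    # Pass-through processor: inspect each work unit, then re-yield it unchanged.
    for wu in stream:
        print(wu.id)
        yield wu
```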
```diff
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()

+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
             for aspect in self._create_iceberg_table_aspects(
                 dataset_name, table, namespace_urn
             ):
-                yield
-
-
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return

         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield
-
-
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
```
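Each aspect emitted above is wrapped into an MCP-based work unit before the per-thread `AutoSystemMetadata` instance stamps it with system metadata. A minimal sketch of just the wrapping step, with a made-up dataset URN:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

# Hypothetical URN; aspect -> MCP -> work unit, as done for every table and namespace aspect above.
wu = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:iceberg,db.table,PROD)",
    aspect=StatusClass(removed=False),
).as_workunit()
print(wu.id)
```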
```diff
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership

         yield self._create_schema_metadata(dataset_name, table)
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))

         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             profiler = IcebergProfiler(self.report, self.config.profiling)
             yield from profiler.profile_table(dataset_name, table)

+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
```
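The new `_create_browse_paths_aspect` helper simply builds a `BrowsePathsV2Class` whose entries are the platform-instance URN followed by the container URN. A hedged standalone sketch with made-up URN values:

```python
from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

# Hypothetical URNs, mirroring the order used by _create_browse_paths_aspect above.
instance_urn = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,prod_catalog)"
container_urn = "urn:li:container:abc123"

browse_paths = BrowsePathsV2Class(
    path=[
        BrowsePathEntryClass(id=instance_urn, urn=instance_urn),
        BrowsePathEntryClass(id=container_urn, urn=container_urn),
    ]
)
```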
```diff
@@ -425,23 +511,21 @@ class IcebergSource(StatefulIngestionSourceBase):
     def _get_dataset_properties_aspect(
         self, dataset_name: str, table: Table
     ) -> DatasetPropertiesClass:
-
+        created: Optional[TimeStampClass] = None
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
         if table.current_snapshot():
             custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
             custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-
-                int(table.current_snapshot().timestamp_ms)
-            )
+            if not last_modified:
+                last_modified = int(table.current_snapshot().timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
-
-                    int(dt.timestamp() * 1000)
-                )
+                created = TimeStampClass(int(dt.timestamp() * 1000))
             except Exception as ex:
                 LOGGER.warning(
                     f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +535,10 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
-            lastModified=
-
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
             qualifiedName=dataset_name,
         )

```
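The reworked `_get_dataset_properties_aspect` now takes `table.metadata.last_updated_ms` first and falls back to the current snapshot's timestamp, with both `created` and `lastModified` ending up as `TimeStampClass` values on `DatasetPropertiesClass`. A minimal hedged sketch with made-up epoch-millisecond values:

```python
from datahub.metadata.schema_classes import DatasetPropertiesClass, TimeStampClass

# Hypothetical timestamps standing in for Iceberg table metadata.
created_ms = 1700000000000
last_updated_ms = 1700090000000

props = DatasetPropertiesClass(
    name="orders",
    qualifiedName="db.orders",
    created=TimeStampClass(created_ms),
    lastModified=TimeStampClass(last_updated_ms),
)
```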
```diff
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)


 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
```
datahub/ingestion/source/kafka/kafka.py

```diff
@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

         for config_key in KafkaTopicConfigKeys:
             try:
-                if (
-                    config_key in topic_config.keys()
-                    and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value
```
datahub/ingestion/source/kafka_connect/sink_connectors.py

```diff
@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
```
datahub/ingestion/source/kafka_connect/source_connectors.py

```diff
@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
```
datahub/ingestion/source/looker/looker_source.py

```diff
@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters
+        for field in filters:
             if field is None:
                 continue

@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()

```