acryl-datahub 1.2.0.2rc3__py3-none-any.whl → 1.2.0.3rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/METADATA +2533 -2533
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/RECORD +15 -15
- datahub/_version.py +1 -1
- datahub/ingestion/source/sql/vertica.py +3 -0
- datahub/ingestion/source/sql_queries.py +86 -44
- datahub/ingestion/source/unity/proxy.py +112 -22
- datahub/ingestion/source/unity/source.py +7 -10
- datahub/metadata/schema.avsc +9 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +9 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.2rc3.dist-info → acryl_datahub-1.2.0.3rc1.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
acryl_datahub-1.2.0.
|
|
1
|
+
acryl_datahub-1.2.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
2
2
|
datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
|
|
3
3
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
4
|
-
datahub/_version.py,sha256=
|
|
4
|
+
datahub/_version.py,sha256=BAFY2OaLzkEm6Hs8RwoQ69XvJQdk3iPPrc9oCg1xGAE,323
|
|
5
5
|
datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
|
|
6
6
|
datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
|
|
7
7
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -227,7 +227,7 @@ datahub/ingestion/source/pulsar.py,sha256=u5F8QnCLJsht5-7XCiUTsnfhCPIpKVB_l32CgM
|
|
|
227
227
|
datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99WdvcYiA,30653
|
|
228
228
|
datahub/ingestion/source/salesforce.py,sha256=Pa_w1XszxFd8fyhpSWOfc2nOnevHwwstIvnRrQT4R9M,40584
|
|
229
229
|
datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
|
|
230
|
-
datahub/ingestion/source/sql_queries.py,sha256=
|
|
230
|
+
datahub/ingestion/source/sql_queries.py,sha256=9ICUC6tpXpxvtwfK-9lytJzFcLe8MrNlciwy9DIFM-4,13764
|
|
231
231
|
datahub/ingestion/source/superset.py,sha256=oi7F2jlvkVr9ItJ_r1Jm4bYfXHYu4vPAFPMPaGJKB84,50608
|
|
232
232
|
datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
233
233
|
datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
|
|
@@ -524,7 +524,7 @@ datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVH
|
|
|
524
524
|
datahub/ingestion/source/sql/teradata.py,sha256=xL_c_UEM_JT-xoMw5Nb5UvSNBUfTGol5CpOkgK5Bsjk,65412
|
|
525
525
|
datahub/ingestion/source/sql/trino.py,sha256=zIfQ6GvW8Sbw4sxqsTcnibT51STka_nzNYvmld6HfHw,18947
|
|
526
526
|
datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=AB3Gtx4omAy_08zadHQpmUGmIGufkZ6o_ihWNnfvzYc,5783
|
|
527
|
-
datahub/ingestion/source/sql/vertica.py,sha256=
|
|
527
|
+
datahub/ingestion/source/sql/vertica.py,sha256=blnu1-H7vnSQD3ZD5QTotoQ2DQJWJeR0uxz_clxiPGo,33518
|
|
528
528
|
datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
|
|
529
529
|
datahub/ingestion/source/sql/mssql/job_models.py,sha256=nAo3rciu-w2-dXCz6_ekDEbGMEjCMEfh8WvSfXoF2l0,9359
|
|
530
530
|
datahub/ingestion/source/sql/mssql/source.py,sha256=Uise_u6yXKU__9B_U3D3yObWNIVDzrz2AgEDZOlk6bQ,43101
|
|
@@ -558,11 +558,11 @@ datahub/ingestion/source/unity/config.py,sha256=7QosoBthg9kirHfXev_vhefkobUxYnp1
|
|
|
558
558
|
datahub/ingestion/source/unity/connection_test.py,sha256=B143Wb28fS0V4GhygU9hzKqiArWBjsQO54IUCPf23dc,2586
|
|
559
559
|
datahub/ingestion/source/unity/ge_profiler.py,sha256=NBRHZceq-f95iUn7u0h7cgcd9nAc48Aa-lmp_BqE0As,8409
|
|
560
560
|
datahub/ingestion/source/unity/hive_metastore_proxy.py,sha256=IAWWJjaW0si_UF52Se2D7wmdYRY_afUG4QlVmQu6xaw,15351
|
|
561
|
-
datahub/ingestion/source/unity/proxy.py,sha256=
|
|
561
|
+
datahub/ingestion/source/unity/proxy.py,sha256=iZ2ftKOXkxpFr0_2bEYEm31ci9OZJWFYgna3DNLCXrQ,26706
|
|
562
562
|
datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
|
|
563
563
|
datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
|
|
564
564
|
datahub/ingestion/source/unity/report.py,sha256=XFT9oQfvEB4RkTvWGgFOoQuLPUN_AIoPXZ79xeDhGHQ,2831
|
|
565
|
-
datahub/ingestion/source/unity/source.py,sha256=
|
|
565
|
+
datahub/ingestion/source/unity/source.py,sha256=udK1_WI7RO2Uzx9xis0mx264nHXDiMeP4Kah3-CFXis,49832
|
|
566
566
|
datahub/ingestion/source/unity/tag_entities.py,sha256=iWl6nRAWSye1hoFDx_Xh4aT53PN0sGzlX7n1-oTVUv8,11568
|
|
567
567
|
datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
|
|
568
568
|
datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -627,7 +627,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
|
|
|
627
627
|
datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
|
|
628
628
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
629
629
|
datahub/metadata/_internal_schema_classes.py,sha256=Zh2volhvkUCENRavXDwPsmwfRe62k8_O6f5QT8_bh-g,1051205
|
|
630
|
-
datahub/metadata/schema.avsc,sha256=
|
|
630
|
+
datahub/metadata/schema.avsc,sha256=u5iUlz9AnFfJijjJ9xcZx3MoiBfxWSmhr7pZIOg0tMo,738363
|
|
631
631
|
datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
|
|
632
632
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
633
633
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
@@ -781,7 +781,7 @@ datahub/metadata/schemas/DataHubUpgradeResult.avsc,sha256=VydVb4yqjIviR73-T6TooF
|
|
|
781
781
|
datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdPQYwvhhv9RQv6gEeg,11642
|
|
782
782
|
datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
|
|
783
783
|
datahub/metadata/schemas/DataJobInfo.avsc,sha256=Bc9qdDcXI0GQdEgNTpgHaBbnrppDKQ-1xR26diOSVIQ,7488
|
|
784
|
-
datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=
|
|
784
|
+
datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=BYKImZ8kQQHqWbSBMKXWD0tGi96yzUt8zJFW3_twVVM,15575
|
|
785
785
|
datahub/metadata/schemas/DataJobKey.avsc,sha256=S7egH8jWjKW52MG6Pg7plDoP15XfTTiMde5V6nR6ycE,1624
|
|
786
786
|
datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
|
|
787
787
|
datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
|
|
@@ -884,7 +884,7 @@ datahub/metadata/schemas/MLModelProperties.avsc,sha256=hDCBHxGe-cmCBeU1k0ANuQlKj
|
|
|
884
884
|
datahub/metadata/schemas/MLPrimaryKeyKey.avsc,sha256=F3lgpMnHBhXsqGncHE9x06P-0RiNCrzbUUWlMkPJxFI,1132
|
|
885
885
|
datahub/metadata/schemas/MLPrimaryKeyProperties.avsc,sha256=URIuOpS93RVk8MZVcbZ-dmTwu_cN3KSOKxSR8fm-eTo,6744
|
|
886
886
|
datahub/metadata/schemas/MLTrainingRunProperties.avsc,sha256=WGgj0MuQrGD4UgvyHCJHzTnHja2LlJTOr1gLu8SySj0,4269
|
|
887
|
-
datahub/metadata/schemas/MetadataChangeEvent.avsc,sha256=
|
|
887
|
+
datahub/metadata/schemas/MetadataChangeEvent.avsc,sha256=oNK0N8WrBsM_AoZkdYAMJQlhYzbao_QWaAMOjqEvPBw,378228
|
|
888
888
|
datahub/metadata/schemas/MetadataChangeLog.avsc,sha256=soCmgrcEBE5yS-mQIm-RIefhb74ONj9Fqayxa0-59KE,13254
|
|
889
889
|
datahub/metadata/schemas/MetadataChangeProposal.avsc,sha256=pT14vUmpj7VJ8hinQ0pcCUtRKx6RAGHWh1eJixkqaE8,12647
|
|
890
890
|
datahub/metadata/schemas/Metrics.avsc,sha256=O7DJGjOwmHbb1x_Zj7AuM_HaHKjBvkfJKfUsX8icXD4,690
|
|
@@ -933,7 +933,7 @@ datahub/metadata/schemas/TestInfo.avsc,sha256=rye90gdY_lxZt_1gpa_Xum923CJgDU6i_e
|
|
|
933
933
|
datahub/metadata/schemas/TestKey.avsc,sha256=eL-S4Z8EuN1JEXV1t4fy3LwmdA2dJURasFcKygP2rLY,421
|
|
934
934
|
datahub/metadata/schemas/TestResults.avsc,sha256=uspC95AzRvz2_AgHVb5-fxELm5u8NmBTaFVJvGunmh0,5178
|
|
935
935
|
datahub/metadata/schemas/TrainingData.avsc,sha256=7p7sFBA_UyV5IbNU5qLgS3vVu70yevKCfJKSGmTzVTg,2069
|
|
936
|
-
datahub/metadata/schemas/UpstreamLineage.avsc,sha256=
|
|
936
|
+
datahub/metadata/schemas/UpstreamLineage.avsc,sha256=dtpI7KUv9kYyGZmIlKfR2zLwgqsHO5P20egvIeup1EU,11000
|
|
937
937
|
datahub/metadata/schemas/UsageAggregation.avsc,sha256=QaF6lyWGUq8IlRel2h4qIXOXCMxBhrwjoaUELsd-I6g,4538
|
|
938
938
|
datahub/metadata/schemas/VersionInfo.avsc,sha256=9gMcZ8tjuhgcZiq2gOAp_EOV9q9jvuOgfph6m6v_X7c,1189
|
|
939
939
|
datahub/metadata/schemas/VersionProperties.avsc,sha256=ME8V01JzG8lEsLXgYWnSYCehmpPcvv1UbE5Y8-8Ys9k,8022
|
|
@@ -1101,8 +1101,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
1101
1101
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
1102
1102
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
1103
1103
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
1104
|
-
acryl_datahub-1.2.0.
|
|
1105
|
-
acryl_datahub-1.2.0.
|
|
1106
|
-
acryl_datahub-1.2.0.
|
|
1107
|
-
acryl_datahub-1.2.0.
|
|
1108
|
-
acryl_datahub-1.2.0.
|
|
1104
|
+
acryl_datahub-1.2.0.3rc1.dist-info/METADATA,sha256=zo0PhZMaumsiXe8Vq8ud1VQPSZWVspaAuUIx1FoCk9s,182014
|
|
1105
|
+
acryl_datahub-1.2.0.3rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1106
|
+
acryl_datahub-1.2.0.3rc1.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
|
|
1107
|
+
acryl_datahub-1.2.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1108
|
+
acryl_datahub-1.2.0.3rc1.dist-info/RECORD,,
|
datahub/_version.py
CHANGED
|
@@ -4,6 +4,7 @@ from dataclasses import dataclass
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
5
5
|
|
|
6
6
|
import pydantic
|
|
7
|
+
import pytest
|
|
7
8
|
from pydantic.class_validators import validator
|
|
8
9
|
from vertica_sqlalchemy_dialect.base import VerticaInspector
|
|
9
10
|
|
|
@@ -55,6 +56,8 @@ from datahub.utilities import config_clean
|
|
|
55
56
|
|
|
56
57
|
if TYPE_CHECKING:
|
|
57
58
|
from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
|
|
59
|
+
|
|
60
|
+
pytestmark = pytest.mark.integration_batch_4
|
|
58
61
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
59
62
|
|
|
60
63
|
|
|
@@ -2,12 +2,13 @@ import json
|
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
-
from datetime import datetime
|
|
5
|
+
from datetime import datetime
|
|
6
6
|
from functools import partial
|
|
7
|
-
from typing import Iterable, List, Optional, Union
|
|
7
|
+
from typing import ClassVar, Iterable, List, Optional, Union
|
|
8
8
|
|
|
9
|
-
from pydantic import Field
|
|
9
|
+
from pydantic import BaseModel, Field, validator
|
|
10
10
|
|
|
11
|
+
from datahub.configuration.datetimes import parse_user_datetime
|
|
11
12
|
from datahub.configuration.source_common import (
|
|
12
13
|
EnvConfigMixin,
|
|
13
14
|
PlatformInstanceConfigMixin,
|
|
@@ -35,7 +36,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
|
35
36
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
36
37
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
37
38
|
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
38
|
-
from datahub.metadata.urns import CorpUserUrn
|
|
39
|
+
from datahub.metadata.urns import CorpUserUrn, DatasetUrn
|
|
39
40
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
40
41
|
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
41
42
|
KnownQueryLineageInfo,
|
|
@@ -208,19 +209,40 @@ class SqlQueriesSource(Source):
|
|
|
208
209
|
def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
|
|
209
210
|
"""Add a query to the SQL parsing aggregator."""
|
|
210
211
|
try:
|
|
211
|
-
# If we have
|
|
212
|
-
if query_entry.upstream_tables
|
|
212
|
+
# If we have both upstream and downstream tables, use explicit lineage
|
|
213
|
+
if query_entry.upstream_tables and query_entry.downstream_tables:
|
|
213
214
|
logger.debug("Using explicit lineage from query file")
|
|
214
215
|
for downstream_table in query_entry.downstream_tables:
|
|
215
216
|
known_lineage = KnownQueryLineageInfo(
|
|
216
217
|
query_text=query_entry.query,
|
|
217
|
-
downstream=downstream_table,
|
|
218
|
-
upstreams=query_entry.upstream_tables,
|
|
218
|
+
downstream=str(downstream_table),
|
|
219
|
+
upstreams=[str(urn) for urn in query_entry.upstream_tables],
|
|
219
220
|
timestamp=query_entry.timestamp,
|
|
220
221
|
session_id=query_entry.session_id,
|
|
221
222
|
)
|
|
222
223
|
self.aggregator.add_known_query_lineage(known_lineage)
|
|
223
224
|
else:
|
|
225
|
+
# Warn if only partial lineage information is provided
|
|
226
|
+
# XOR: true if exactly one of upstream_tables or downstream_tables is provided
|
|
227
|
+
if bool(query_entry.upstream_tables) ^ bool(
|
|
228
|
+
query_entry.downstream_tables
|
|
229
|
+
):
|
|
230
|
+
query_preview = (
|
|
231
|
+
query_entry.query[:150] + "..."
|
|
232
|
+
if len(query_entry.query) > 150
|
|
233
|
+
else query_entry.query
|
|
234
|
+
)
|
|
235
|
+
missing_upstream = (
|
|
236
|
+
"Missing upstream. " if not query_entry.upstream_tables else ""
|
|
237
|
+
)
|
|
238
|
+
missing_downstream = (
|
|
239
|
+
"Missing downstream. "
|
|
240
|
+
if not query_entry.downstream_tables
|
|
241
|
+
else ""
|
|
242
|
+
)
|
|
243
|
+
logger.info(
|
|
244
|
+
f"Only partial lineage information provided, falling back to SQL parsing for complete lineage detection. {missing_upstream}{missing_downstream}Query: {query_preview}"
|
|
245
|
+
)
|
|
224
246
|
# No explicit lineage, rely on parsing
|
|
225
247
|
observed_query = ObservedQuery(
|
|
226
248
|
query=query_entry.query,
|
|
@@ -243,46 +265,66 @@ class SqlQueriesSource(Source):
|
|
|
243
265
|
)
|
|
244
266
|
|
|
245
267
|
|
|
246
|
-
|
|
247
|
-
class QueryEntry:
|
|
268
|
+
class QueryEntry(BaseModel):
|
|
248
269
|
query: str
|
|
249
|
-
timestamp: Optional[datetime]
|
|
250
|
-
user: Optional[CorpUserUrn]
|
|
251
|
-
operation_type: Optional[str]
|
|
252
|
-
downstream_tables: List[
|
|
253
|
-
upstream_tables: List[
|
|
270
|
+
timestamp: Optional[datetime] = None
|
|
271
|
+
user: Optional[CorpUserUrn] = None
|
|
272
|
+
operation_type: Optional[str] = None
|
|
273
|
+
downstream_tables: List[DatasetUrn] = Field(default_factory=list)
|
|
274
|
+
upstream_tables: List[DatasetUrn] = Field(default_factory=list)
|
|
254
275
|
session_id: Optional[str] = None
|
|
255
276
|
|
|
277
|
+
# Validation context for URN creation
|
|
278
|
+
_validation_context: ClassVar[Optional[SqlQueriesSourceConfig]] = None
|
|
279
|
+
|
|
280
|
+
class Config:
|
|
281
|
+
arbitrary_types_allowed = True
|
|
282
|
+
|
|
283
|
+
@validator("timestamp", pre=True)
|
|
284
|
+
def parse_timestamp(cls, v):
|
|
285
|
+
return None if v is None else parse_user_datetime(str(v))
|
|
286
|
+
|
|
287
|
+
@validator("user", pre=True)
|
|
288
|
+
def parse_user(cls, v):
|
|
289
|
+
if v is None:
|
|
290
|
+
return None
|
|
291
|
+
|
|
292
|
+
return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
|
|
293
|
+
|
|
294
|
+
@validator("downstream_tables", "upstream_tables", pre=True)
|
|
295
|
+
def parse_tables(cls, v):
|
|
296
|
+
if not v:
|
|
297
|
+
return []
|
|
298
|
+
|
|
299
|
+
result = []
|
|
300
|
+
for item in v:
|
|
301
|
+
if isinstance(item, DatasetUrn):
|
|
302
|
+
result.append(item)
|
|
303
|
+
elif isinstance(item, str):
|
|
304
|
+
# Skip empty/whitespace-only strings
|
|
305
|
+
if item and item.strip():
|
|
306
|
+
# Convert to URN using validation context
|
|
307
|
+
assert cls._validation_context, (
|
|
308
|
+
"Validation context must be set for URN creation"
|
|
309
|
+
)
|
|
310
|
+
urn_string = make_dataset_urn_with_platform_instance(
|
|
311
|
+
name=item,
|
|
312
|
+
platform=cls._validation_context.platform,
|
|
313
|
+
platform_instance=cls._validation_context.platform_instance,
|
|
314
|
+
env=cls._validation_context.env,
|
|
315
|
+
)
|
|
316
|
+
result.append(DatasetUrn.from_string(urn_string))
|
|
317
|
+
|
|
318
|
+
return result
|
|
319
|
+
|
|
256
320
|
@classmethod
|
|
257
321
|
def create(
|
|
258
322
|
cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
|
|
259
323
|
) -> "QueryEntry":
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
user=CorpUserUrn(entry_dict["user"]) if "user" in entry_dict else None,
|
|
268
|
-
operation_type=entry_dict.get("operation_type"),
|
|
269
|
-
downstream_tables=[
|
|
270
|
-
make_dataset_urn_with_platform_instance(
|
|
271
|
-
name=table,
|
|
272
|
-
platform=config.platform,
|
|
273
|
-
platform_instance=config.platform_instance,
|
|
274
|
-
env=config.env,
|
|
275
|
-
)
|
|
276
|
-
for table in entry_dict.get("downstream_tables", [])
|
|
277
|
-
],
|
|
278
|
-
upstream_tables=[
|
|
279
|
-
make_dataset_urn_with_platform_instance(
|
|
280
|
-
name=table,
|
|
281
|
-
platform=config.platform,
|
|
282
|
-
platform_instance=config.platform_instance,
|
|
283
|
-
env=config.env,
|
|
284
|
-
)
|
|
285
|
-
for table in entry_dict.get("upstream_tables", [])
|
|
286
|
-
],
|
|
287
|
-
session_id=entry_dict.get("session_id"),
|
|
288
|
-
)
|
|
324
|
+
"""Create QueryEntry from dict with config context."""
|
|
325
|
+
# Set validation context for URN creation
|
|
326
|
+
cls._validation_context = config
|
|
327
|
+
try:
|
|
328
|
+
return cls.parse_obj(entry_dict)
|
|
329
|
+
finally:
|
|
330
|
+
cls._validation_context = None
|
|
@@ -4,8 +4,9 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
|
|
|
4
4
|
|
|
5
5
|
import dataclasses
|
|
6
6
|
import logging
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
8
|
from datetime import datetime
|
|
8
|
-
from typing import Any, Dict, Iterable, List, Optional, Union, cast
|
|
9
|
+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
|
|
9
10
|
from unittest.mock import patch
|
|
10
11
|
|
|
11
12
|
import cachetools
|
|
@@ -28,6 +29,7 @@ from databricks.sdk.service.sql import (
|
|
|
28
29
|
)
|
|
29
30
|
from databricks.sdk.service.workspace import ObjectType
|
|
30
31
|
from databricks.sql import connect
|
|
32
|
+
from databricks.sql.types import Row
|
|
31
33
|
|
|
32
34
|
from datahub._version import nice_version_name
|
|
33
35
|
from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
|
|
@@ -291,10 +293,59 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
291
293
|
method, path, body={**body, "page_token": response["next_page_token"]}
|
|
292
294
|
)
|
|
293
295
|
|
|
296
|
+
@cached(cachetools.FIFOCache(maxsize=100))
|
|
297
|
+
def get_catalog_column_lineage(self, catalog: str) -> Dict[str, Dict[str, dict]]:
|
|
298
|
+
"""Get column lineage for all tables in a catalog."""
|
|
299
|
+
logger.info(f"Fetching column lineage for catalog: {catalog}")
|
|
300
|
+
try:
|
|
301
|
+
query = """
|
|
302
|
+
SELECT
|
|
303
|
+
source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
|
|
304
|
+
target_table_schema, target_table_name, target_column_name,
|
|
305
|
+
max(event_time)
|
|
306
|
+
FROM system.access.column_lineage
|
|
307
|
+
WHERE
|
|
308
|
+
target_table_catalog = %s
|
|
309
|
+
AND target_table_schema IS NOT NULL
|
|
310
|
+
AND target_table_name IS NOT NULL
|
|
311
|
+
AND target_column_name IS NOT NULL
|
|
312
|
+
AND source_table_catalog IS NOT NULL
|
|
313
|
+
AND source_table_schema IS NOT NULL
|
|
314
|
+
AND source_table_name IS NOT NULL
|
|
315
|
+
AND source_column_name IS NOT NULL
|
|
316
|
+
GROUP BY
|
|
317
|
+
source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
|
|
318
|
+
target_table_schema, target_table_name, target_column_name
|
|
319
|
+
"""
|
|
320
|
+
rows = self._execute_sql_query(query, (catalog,))
|
|
321
|
+
|
|
322
|
+
result_dict: Dict[str, Dict[str, dict]] = {}
|
|
323
|
+
for row in rows:
|
|
324
|
+
result_dict.setdefault(row["target_table_schema"], {}).setdefault(
|
|
325
|
+
row["target_table_name"], {}
|
|
326
|
+
).setdefault(row["target_column_name"], []).append(
|
|
327
|
+
# make fields look like the response from the older HTTP API
|
|
328
|
+
{
|
|
329
|
+
"catalog_name": row["source_table_catalog"],
|
|
330
|
+
"schema_name": row["source_table_schema"],
|
|
331
|
+
"table_name": row["source_table_name"],
|
|
332
|
+
"name": row["source_column_name"],
|
|
333
|
+
}
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
return result_dict
|
|
337
|
+
except Exception as e:
|
|
338
|
+
logger.warning(
|
|
339
|
+
f"Error getting column lineage for catalog {catalog}: {e}",
|
|
340
|
+
exc_info=True,
|
|
341
|
+
)
|
|
342
|
+
return {}
|
|
343
|
+
|
|
294
344
|
def list_lineages_by_table(
|
|
295
345
|
self, table_name: str, include_entity_lineage: bool
|
|
296
346
|
) -> dict:
|
|
297
347
|
"""List table lineage by table name."""
|
|
348
|
+
logger.debug(f"Getting table lineage for {table_name}")
|
|
298
349
|
return self._workspace_client.api_client.do( # type: ignore
|
|
299
350
|
method="GET",
|
|
300
351
|
path="/api/2.0/lineage-tracking/table-lineage",
|
|
@@ -304,13 +355,24 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
304
355
|
},
|
|
305
356
|
)
|
|
306
357
|
|
|
307
|
-
def list_lineages_by_column(self, table_name: str, column_name: str) ->
|
|
358
|
+
def list_lineages_by_column(self, table_name: str, column_name: str) -> list:
|
|
308
359
|
"""List column lineage by table name and column name."""
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
360
|
+
logger.debug(f"Getting column lineage for {table_name}.{column_name}")
|
|
361
|
+
try:
|
|
362
|
+
return (
|
|
363
|
+
self._workspace_client.api_client.do( # type: ignore
|
|
364
|
+
"GET",
|
|
365
|
+
"/api/2.0/lineage-tracking/column-lineage",
|
|
366
|
+
body={"table_name": table_name, "column_name": column_name},
|
|
367
|
+
).get("upstream_cols")
|
|
368
|
+
or []
|
|
369
|
+
)
|
|
370
|
+
except Exception as e:
|
|
371
|
+
logger.warning(
|
|
372
|
+
f"Error getting column lineage on table {table_name}, column {column_name}: {e}",
|
|
373
|
+
exc_info=True,
|
|
374
|
+
)
|
|
375
|
+
return []
|
|
314
376
|
|
|
315
377
|
def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
|
|
316
378
|
if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
|
|
@@ -348,23 +410,51 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
348
410
|
f"Error getting lineage on table {table.ref}: {e}", exc_info=True
|
|
349
411
|
)
|
|
350
412
|
|
|
351
|
-
def get_column_lineage(
|
|
413
|
+
def get_column_lineage(
|
|
414
|
+
self,
|
|
415
|
+
table: Table,
|
|
416
|
+
column_names: List[str],
|
|
417
|
+
*,
|
|
418
|
+
max_workers: Optional[int] = None,
|
|
419
|
+
) -> None:
|
|
352
420
|
try:
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
421
|
+
# use the newer system tables if we have a SQL warehouse, otherwise fall back
|
|
422
|
+
# and use the older (and much slower) HTTP API.
|
|
423
|
+
if self.warehouse_id:
|
|
424
|
+
lineage = (
|
|
425
|
+
self.get_catalog_column_lineage(table.ref.catalog)
|
|
426
|
+
.get(table.ref.schema, {})
|
|
427
|
+
.get(table.ref.table, {})
|
|
360
428
|
)
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
429
|
+
else:
|
|
430
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
431
|
+
futures = [
|
|
432
|
+
executor.submit(
|
|
433
|
+
self.list_lineages_by_column,
|
|
434
|
+
table.ref.qualified_table_name,
|
|
435
|
+
column_name,
|
|
436
|
+
)
|
|
437
|
+
for column_name in column_names
|
|
438
|
+
]
|
|
439
|
+
lineage = {
|
|
440
|
+
column_name: future.result()
|
|
441
|
+
for column_name, future in zip(column_names, futures)
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
for column_name in column_names:
|
|
445
|
+
for item in lineage.get(column_name) or []:
|
|
446
|
+
table_ref = TableReference.create_from_lineage(
|
|
447
|
+
item,
|
|
448
|
+
table.schema.catalog.metastore,
|
|
449
|
+
)
|
|
450
|
+
if table_ref:
|
|
451
|
+
table.upstreams.setdefault(table_ref, {}).setdefault(
|
|
452
|
+
column_name, []
|
|
453
|
+
).append(item["name"])
|
|
454
|
+
|
|
365
455
|
except Exception as e:
|
|
366
456
|
logger.warning(
|
|
367
|
-
f"Error getting column lineage on table {table.ref}
|
|
457
|
+
f"Error getting column lineage on table {table.ref}: {e}",
|
|
368
458
|
exc_info=True,
|
|
369
459
|
)
|
|
370
460
|
|
|
@@ -504,14 +594,14 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
|
|
|
504
594
|
executed_as_user_name=info.executed_as_user_name,
|
|
505
595
|
)
|
|
506
596
|
|
|
507
|
-
def _execute_sql_query(self, query: str) -> List[
|
|
597
|
+
def _execute_sql_query(self, query: str, params: Sequence[Any] = ()) -> List[Row]:
|
|
508
598
|
"""Execute SQL query using databricks-sql connector for better performance"""
|
|
509
599
|
try:
|
|
510
600
|
with (
|
|
511
601
|
connect(**self._sql_connection_params) as connection,
|
|
512
602
|
connection.cursor() as cursor,
|
|
513
603
|
):
|
|
514
|
-
cursor.execute(query)
|
|
604
|
+
cursor.execute(query, list(params))
|
|
515
605
|
return cursor.fetchall()
|
|
516
606
|
|
|
517
607
|
except Exception as e:
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
3
|
import time
|
|
4
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
5
4
|
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
|
6
5
|
from urllib.parse import urljoin
|
|
7
6
|
|
|
@@ -657,15 +656,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
657
656
|
if len(table.columns) > self.config.column_lineage_column_limit:
|
|
658
657
|
self.report.num_column_lineage_skipped_column_count += 1
|
|
659
658
|
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
column.name,
|
|
668
|
-
)
|
|
659
|
+
column_names = [
|
|
660
|
+
column.name
|
|
661
|
+
for column in table.columns[: self.config.column_lineage_column_limit]
|
|
662
|
+
]
|
|
663
|
+
self.unity_catalog_api_proxy.get_column_lineage(
|
|
664
|
+
table, column_names, max_workers=self.config.lineage_max_workers
|
|
665
|
+
)
|
|
669
666
|
|
|
670
667
|
return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
|
|
671
668
|
|
datahub/metadata/schema.avsc
CHANGED
|
@@ -4319,6 +4319,14 @@
|
|
|
4319
4319
|
"doc": "The type of upstream entity"
|
|
4320
4320
|
},
|
|
4321
4321
|
{
|
|
4322
|
+
"Searchable": {
|
|
4323
|
+
"/*": {
|
|
4324
|
+
"fieldName": "fineGrainedUpstreams",
|
|
4325
|
+
"fieldType": "URN",
|
|
4326
|
+
"hasValuesFieldName": "hasFineGrainedUpstreams",
|
|
4327
|
+
"queryByDefault": false
|
|
4328
|
+
}
|
|
4329
|
+
},
|
|
4322
4330
|
"Urn": "Urn",
|
|
4323
4331
|
"urn_is_array": true,
|
|
4324
4332
|
"type": [
|
|
@@ -12875,6 +12883,7 @@
|
|
|
12875
12883
|
"Searchable": {
|
|
12876
12884
|
"fieldName": "upstreams",
|
|
12877
12885
|
"fieldType": "URN",
|
|
12886
|
+
"hasValuesFieldName": "hasUpstreams",
|
|
12878
12887
|
"queryByDefault": false
|
|
12879
12888
|
},
|
|
12880
12889
|
"java": {
|
|
@@ -375,6 +375,14 @@
|
|
|
375
375
|
"doc": "The type of upstream entity"
|
|
376
376
|
},
|
|
377
377
|
{
|
|
378
|
+
"Searchable": {
|
|
379
|
+
"/*": {
|
|
380
|
+
"fieldName": "fineGrainedUpstreams",
|
|
381
|
+
"fieldType": "URN",
|
|
382
|
+
"hasValuesFieldName": "hasFineGrainedUpstreams",
|
|
383
|
+
"queryByDefault": false
|
|
384
|
+
}
|
|
385
|
+
},
|
|
378
386
|
"type": [
|
|
379
387
|
"null",
|
|
380
388
|
{
|
|
@@ -3070,6 +3070,14 @@
|
|
|
3070
3070
|
"doc": "The type of upstream entity"
|
|
3071
3071
|
},
|
|
3072
3072
|
{
|
|
3073
|
+
"Searchable": {
|
|
3074
|
+
"/*": {
|
|
3075
|
+
"fieldName": "fineGrainedUpstreams",
|
|
3076
|
+
"fieldType": "URN",
|
|
3077
|
+
"hasValuesFieldName": "hasFineGrainedUpstreams",
|
|
3078
|
+
"queryByDefault": false
|
|
3079
|
+
}
|
|
3080
|
+
},
|
|
3073
3081
|
"type": [
|
|
3074
3082
|
"null",
|
|
3075
3083
|
{
|
|
@@ -3691,6 +3699,7 @@
|
|
|
3691
3699
|
"Searchable": {
|
|
3692
3700
|
"fieldName": "upstreams",
|
|
3693
3701
|
"fieldType": "URN",
|
|
3702
|
+
"hasValuesFieldName": "hasUpstreams",
|
|
3694
3703
|
"queryByDefault": false
|
|
3695
3704
|
},
|
|
3696
3705
|
"java": {
|
|
@@ -94,6 +94,7 @@
|
|
|
94
94
|
"Searchable": {
|
|
95
95
|
"fieldName": "upstreams",
|
|
96
96
|
"fieldType": "URN",
|
|
97
|
+
"hasValuesFieldName": "hasUpstreams",
|
|
97
98
|
"queryByDefault": false
|
|
98
99
|
},
|
|
99
100
|
"java": {
|
|
@@ -199,6 +200,14 @@
|
|
|
199
200
|
"doc": "The type of upstream entity"
|
|
200
201
|
},
|
|
201
202
|
{
|
|
203
|
+
"Searchable": {
|
|
204
|
+
"/*": {
|
|
205
|
+
"fieldName": "fineGrainedUpstreams",
|
|
206
|
+
"fieldType": "URN",
|
|
207
|
+
"hasValuesFieldName": "hasFineGrainedUpstreams",
|
|
208
|
+
"queryByDefault": false
|
|
209
|
+
}
|
|
210
|
+
},
|
|
202
211
|
"type": [
|
|
203
212
|
"null",
|
|
204
213
|
{
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|