acryl-datahub 1.2.0.2rc3__py3-none-any.whl → 1.2.0.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.2.0.2rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.2.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=cyLeAlWzIql-O65xt-MALgl-0jJ-qmzJ-gFq-t6FkdE,323
4
+ datahub/_version.py,sha256=BAFY2OaLzkEm6Hs8RwoQ69XvJQdk3iPPrc9oCg1xGAE,323
5
5
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -227,7 +227,7 @@ datahub/ingestion/source/pulsar.py,sha256=u5F8QnCLJsht5-7XCiUTsnfhCPIpKVB_l32CgM
227
227
  datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99WdvcYiA,30653
228
228
  datahub/ingestion/source/salesforce.py,sha256=Pa_w1XszxFd8fyhpSWOfc2nOnevHwwstIvnRrQT4R9M,40584
229
229
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
230
- datahub/ingestion/source/sql_queries.py,sha256=1SdEZGAmojfWbU1xKbezH6uqwRd2G0rgpK_Sh9MRj1U,11791
230
+ datahub/ingestion/source/sql_queries.py,sha256=9ICUC6tpXpxvtwfK-9lytJzFcLe8MrNlciwy9DIFM-4,13764
231
231
  datahub/ingestion/source/superset.py,sha256=oi7F2jlvkVr9ItJ_r1Jm4bYfXHYu4vPAFPMPaGJKB84,50608
232
232
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
233
233
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
@@ -524,7 +524,7 @@ datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVH
524
524
  datahub/ingestion/source/sql/teradata.py,sha256=xL_c_UEM_JT-xoMw5Nb5UvSNBUfTGol5CpOkgK5Bsjk,65412
525
525
  datahub/ingestion/source/sql/trino.py,sha256=zIfQ6GvW8Sbw4sxqsTcnibT51STka_nzNYvmld6HfHw,18947
526
526
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=AB3Gtx4omAy_08zadHQpmUGmIGufkZ6o_ihWNnfvzYc,5783
527
- datahub/ingestion/source/sql/vertica.py,sha256=MeohL8j68ISES1RhrBXLQlkT_YqgT-AvHRxuVCJSMbE,33458
527
+ datahub/ingestion/source/sql/vertica.py,sha256=blnu1-H7vnSQD3ZD5QTotoQ2DQJWJeR0uxz_clxiPGo,33518
528
528
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
529
529
  datahub/ingestion/source/sql/mssql/job_models.py,sha256=nAo3rciu-w2-dXCz6_ekDEbGMEjCMEfh8WvSfXoF2l0,9359
530
530
  datahub/ingestion/source/sql/mssql/source.py,sha256=Uise_u6yXKU__9B_U3D3yObWNIVDzrz2AgEDZOlk6bQ,43101
@@ -558,11 +558,11 @@ datahub/ingestion/source/unity/config.py,sha256=7QosoBthg9kirHfXev_vhefkobUxYnp1
558
558
  datahub/ingestion/source/unity/connection_test.py,sha256=B143Wb28fS0V4GhygU9hzKqiArWBjsQO54IUCPf23dc,2586
559
559
  datahub/ingestion/source/unity/ge_profiler.py,sha256=NBRHZceq-f95iUn7u0h7cgcd9nAc48Aa-lmp_BqE0As,8409
560
560
  datahub/ingestion/source/unity/hive_metastore_proxy.py,sha256=IAWWJjaW0si_UF52Se2D7wmdYRY_afUG4QlVmQu6xaw,15351
561
- datahub/ingestion/source/unity/proxy.py,sha256=jfQ1N8Xrp08zeYN2j74YTweusygXtK4Q-5_FBbwCVTE,22803
561
+ datahub/ingestion/source/unity/proxy.py,sha256=iZ2ftKOXkxpFr0_2bEYEm31ci9OZJWFYgna3DNLCXrQ,26706
562
562
  datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
563
563
  datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
564
564
  datahub/ingestion/source/unity/report.py,sha256=XFT9oQfvEB4RkTvWGgFOoQuLPUN_AIoPXZ79xeDhGHQ,2831
565
- datahub/ingestion/source/unity/source.py,sha256=47sWCYb3pd21RjIqs9NH4h6VMtF_YMvqS9-6cegGi1w,49980
565
+ datahub/ingestion/source/unity/source.py,sha256=udK1_WI7RO2Uzx9xis0mx264nHXDiMeP4Kah3-CFXis,49832
566
566
  datahub/ingestion/source/unity/tag_entities.py,sha256=iWl6nRAWSye1hoFDx_Xh4aT53PN0sGzlX7n1-oTVUv8,11568
567
567
  datahub/ingestion/source/unity/usage.py,sha256=0wETBAaZvHI_EGgBlxX3bKsVHEAdnUV8_bKI_lbyWjY,11500
568
568
  datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -627,7 +627,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
627
627
  datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
628
628
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
629
629
  datahub/metadata/_internal_schema_classes.py,sha256=Zh2volhvkUCENRavXDwPsmwfRe62k8_O6f5QT8_bh-g,1051205
630
- datahub/metadata/schema.avsc,sha256=4X6Jx5TFcOGY8Qxdm-FSgbGkzG2wND992brsxwgumSU,737966
630
+ datahub/metadata/schema.avsc,sha256=u5iUlz9AnFfJijjJ9xcZx3MoiBfxWSmhr7pZIOg0tMo,738363
631
631
  datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
632
632
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
633
633
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -781,7 +781,7 @@ datahub/metadata/schemas/DataHubUpgradeResult.avsc,sha256=VydVb4yqjIviR73-T6TooF
781
781
  datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdPQYwvhhv9RQv6gEeg,11642
782
782
  datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
783
783
  datahub/metadata/schemas/DataJobInfo.avsc,sha256=Bc9qdDcXI0GQdEgNTpgHaBbnrppDKQ-1xR26diOSVIQ,7488
784
- datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
784
+ datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=BYKImZ8kQQHqWbSBMKXWD0tGi96yzUt8zJFW3_twVVM,15575
785
785
  datahub/metadata/schemas/DataJobKey.avsc,sha256=S7egH8jWjKW52MG6Pg7plDoP15XfTTiMde5V6nR6ycE,1624
786
786
  datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
787
787
  datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
@@ -884,7 +884,7 @@ datahub/metadata/schemas/MLModelProperties.avsc,sha256=hDCBHxGe-cmCBeU1k0ANuQlKj
884
884
  datahub/metadata/schemas/MLPrimaryKeyKey.avsc,sha256=F3lgpMnHBhXsqGncHE9x06P-0RiNCrzbUUWlMkPJxFI,1132
885
885
  datahub/metadata/schemas/MLPrimaryKeyProperties.avsc,sha256=URIuOpS93RVk8MZVcbZ-dmTwu_cN3KSOKxSR8fm-eTo,6744
886
886
  datahub/metadata/schemas/MLTrainingRunProperties.avsc,sha256=WGgj0MuQrGD4UgvyHCJHzTnHja2LlJTOr1gLu8SySj0,4269
887
- datahub/metadata/schemas/MetadataChangeEvent.avsc,sha256=l3tVuQces7sKrwWsaIJrn3nMRUiCl3MHqCJJHcw7Ylc,377705
887
+ datahub/metadata/schemas/MetadataChangeEvent.avsc,sha256=oNK0N8WrBsM_AoZkdYAMJQlhYzbao_QWaAMOjqEvPBw,378228
888
888
  datahub/metadata/schemas/MetadataChangeLog.avsc,sha256=soCmgrcEBE5yS-mQIm-RIefhb74ONj9Fqayxa0-59KE,13254
889
889
  datahub/metadata/schemas/MetadataChangeProposal.avsc,sha256=pT14vUmpj7VJ8hinQ0pcCUtRKx6RAGHWh1eJixkqaE8,12647
890
890
  datahub/metadata/schemas/Metrics.avsc,sha256=O7DJGjOwmHbb1x_Zj7AuM_HaHKjBvkfJKfUsX8icXD4,690
@@ -933,7 +933,7 @@ datahub/metadata/schemas/TestInfo.avsc,sha256=rye90gdY_lxZt_1gpa_Xum923CJgDU6i_e
933
933
  datahub/metadata/schemas/TestKey.avsc,sha256=eL-S4Z8EuN1JEXV1t4fy3LwmdA2dJURasFcKygP2rLY,421
934
934
  datahub/metadata/schemas/TestResults.avsc,sha256=uspC95AzRvz2_AgHVb5-fxELm5u8NmBTaFVJvGunmh0,5178
935
935
  datahub/metadata/schemas/TrainingData.avsc,sha256=7p7sFBA_UyV5IbNU5qLgS3vVu70yevKCfJKSGmTzVTg,2069
936
- datahub/metadata/schemas/UpstreamLineage.avsc,sha256=iaeFRbL2aVSYFwj-HQHyfIVaHRrK3kLbkkLXgIfJTsk,10639
936
+ datahub/metadata/schemas/UpstreamLineage.avsc,sha256=dtpI7KUv9kYyGZmIlKfR2zLwgqsHO5P20egvIeup1EU,11000
937
937
  datahub/metadata/schemas/UsageAggregation.avsc,sha256=QaF6lyWGUq8IlRel2h4qIXOXCMxBhrwjoaUELsd-I6g,4538
938
938
  datahub/metadata/schemas/VersionInfo.avsc,sha256=9gMcZ8tjuhgcZiq2gOAp_EOV9q9jvuOgfph6m6v_X7c,1189
939
939
  datahub/metadata/schemas/VersionProperties.avsc,sha256=ME8V01JzG8lEsLXgYWnSYCehmpPcvv1UbE5Y8-8Ys9k,8022
@@ -1101,8 +1101,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1101
1101
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1102
1102
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1103
1103
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1104
- acryl_datahub-1.2.0.2rc3.dist-info/METADATA,sha256=ixYC_JxEXERi_Ik-6tXxNn5W0EMN_poTxliZ99bCS6o,182014
1105
- acryl_datahub-1.2.0.2rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1106
- acryl_datahub-1.2.0.2rc3.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
1107
- acryl_datahub-1.2.0.2rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1108
- acryl_datahub-1.2.0.2rc3.dist-info/RECORD,,
1104
+ acryl_datahub-1.2.0.3rc1.dist-info/METADATA,sha256=zo0PhZMaumsiXe8Vq8ud1VQPSZWVspaAuUIx1FoCk9s,182014
1105
+ acryl_datahub-1.2.0.3rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1106
+ acryl_datahub-1.2.0.3rc1.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
1107
+ acryl_datahub-1.2.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1108
+ acryl_datahub-1.2.0.3rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.2.0.2rc3"
3
+ __version__ = "1.2.0.3rc1"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -4,6 +4,7 @@ from dataclasses import dataclass
4
4
  from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
5
5
 
6
6
  import pydantic
7
+ import pytest
7
8
  from pydantic.class_validators import validator
8
9
  from vertica_sqlalchemy_dialect.base import VerticaInspector
9
10
 
@@ -55,6 +56,8 @@ from datahub.utilities import config_clean
55
56
 
56
57
  if TYPE_CHECKING:
57
58
  from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
59
+
60
+ pytestmark = pytest.mark.integration_batch_4
58
61
  logger: logging.Logger = logging.getLogger(__name__)
59
62
 
60
63
 
@@ -2,12 +2,13 @@ import json
2
2
  import logging
3
3
  import os
4
4
  from dataclasses import dataclass
5
- from datetime import datetime, timezone
5
+ from datetime import datetime
6
6
  from functools import partial
7
- from typing import Iterable, List, Optional, Union
7
+ from typing import ClassVar, Iterable, List, Optional, Union
8
8
 
9
- from pydantic import Field
9
+ from pydantic import BaseModel, Field, validator
10
10
 
11
+ from datahub.configuration.datetimes import parse_user_datetime
11
12
  from datahub.configuration.source_common import (
12
13
  EnvConfigMixin,
13
14
  PlatformInstanceConfigMixin,
@@ -35,7 +36,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
35
36
  from datahub.ingestion.graph.client import DataHubGraph
36
37
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
37
38
  from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
38
- from datahub.metadata.urns import CorpUserUrn
39
+ from datahub.metadata.urns import CorpUserUrn, DatasetUrn
39
40
  from datahub.sql_parsing.schema_resolver import SchemaResolver
40
41
  from datahub.sql_parsing.sql_parsing_aggregator import (
41
42
  KnownQueryLineageInfo,
@@ -208,19 +209,40 @@ class SqlQueriesSource(Source):
208
209
  def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
209
210
  """Add a query to the SQL parsing aggregator."""
210
211
  try:
211
- # If we have explicit lineage, use it directly
212
- if query_entry.upstream_tables or query_entry.downstream_tables:
212
+ # If we have both upstream and downstream tables, use explicit lineage
213
+ if query_entry.upstream_tables and query_entry.downstream_tables:
213
214
  logger.debug("Using explicit lineage from query file")
214
215
  for downstream_table in query_entry.downstream_tables:
215
216
  known_lineage = KnownQueryLineageInfo(
216
217
  query_text=query_entry.query,
217
- downstream=downstream_table,
218
- upstreams=query_entry.upstream_tables,
218
+ downstream=str(downstream_table),
219
+ upstreams=[str(urn) for urn in query_entry.upstream_tables],
219
220
  timestamp=query_entry.timestamp,
220
221
  session_id=query_entry.session_id,
221
222
  )
222
223
  self.aggregator.add_known_query_lineage(known_lineage)
223
224
  else:
225
+ # Warn if only partial lineage information is provided
226
+ # XOR: true if exactly one of upstream_tables or downstream_tables is provided
227
+ if bool(query_entry.upstream_tables) ^ bool(
228
+ query_entry.downstream_tables
229
+ ):
230
+ query_preview = (
231
+ query_entry.query[:150] + "..."
232
+ if len(query_entry.query) > 150
233
+ else query_entry.query
234
+ )
235
+ missing_upstream = (
236
+ "Missing upstream. " if not query_entry.upstream_tables else ""
237
+ )
238
+ missing_downstream = (
239
+ "Missing downstream. "
240
+ if not query_entry.downstream_tables
241
+ else ""
242
+ )
243
+ logger.info(
244
+ f"Only partial lineage information provided, falling back to SQL parsing for complete lineage detection. {missing_upstream}{missing_downstream}Query: {query_preview}"
245
+ )
224
246
  # No explicit lineage, rely on parsing
225
247
  observed_query = ObservedQuery(
226
248
  query=query_entry.query,
@@ -243,46 +265,66 @@ class SqlQueriesSource(Source):
243
265
  )
244
266
 
245
267
 
246
- @dataclass
247
- class QueryEntry:
268
+ class QueryEntry(BaseModel):
248
269
  query: str
249
- timestamp: Optional[datetime]
250
- user: Optional[CorpUserUrn]
251
- operation_type: Optional[str]
252
- downstream_tables: List[str]
253
- upstream_tables: List[str]
270
+ timestamp: Optional[datetime] = None
271
+ user: Optional[CorpUserUrn] = None
272
+ operation_type: Optional[str] = None
273
+ downstream_tables: List[DatasetUrn] = Field(default_factory=list)
274
+ upstream_tables: List[DatasetUrn] = Field(default_factory=list)
254
275
  session_id: Optional[str] = None
255
276
 
277
+ # Validation context for URN creation
278
+ _validation_context: ClassVar[Optional[SqlQueriesSourceConfig]] = None
279
+
280
+ class Config:
281
+ arbitrary_types_allowed = True
282
+
283
+ @validator("timestamp", pre=True)
284
+ def parse_timestamp(cls, v):
285
+ return None if v is None else parse_user_datetime(str(v))
286
+
287
+ @validator("user", pre=True)
288
+ def parse_user(cls, v):
289
+ if v is None:
290
+ return None
291
+
292
+ return v if isinstance(v, CorpUserUrn) else CorpUserUrn(v)
293
+
294
+ @validator("downstream_tables", "upstream_tables", pre=True)
295
+ def parse_tables(cls, v):
296
+ if not v:
297
+ return []
298
+
299
+ result = []
300
+ for item in v:
301
+ if isinstance(item, DatasetUrn):
302
+ result.append(item)
303
+ elif isinstance(item, str):
304
+ # Skip empty/whitespace-only strings
305
+ if item and item.strip():
306
+ # Convert to URN using validation context
307
+ assert cls._validation_context, (
308
+ "Validation context must be set for URN creation"
309
+ )
310
+ urn_string = make_dataset_urn_with_platform_instance(
311
+ name=item,
312
+ platform=cls._validation_context.platform,
313
+ platform_instance=cls._validation_context.platform_instance,
314
+ env=cls._validation_context.env,
315
+ )
316
+ result.append(DatasetUrn.from_string(urn_string))
317
+
318
+ return result
319
+
256
320
  @classmethod
257
321
  def create(
258
322
  cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
259
323
  ) -> "QueryEntry":
260
- return cls(
261
- query=entry_dict["query"],
262
- timestamp=(
263
- datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc)
264
- if "timestamp" in entry_dict
265
- else None
266
- ),
267
- user=CorpUserUrn(entry_dict["user"]) if "user" in entry_dict else None,
268
- operation_type=entry_dict.get("operation_type"),
269
- downstream_tables=[
270
- make_dataset_urn_with_platform_instance(
271
- name=table,
272
- platform=config.platform,
273
- platform_instance=config.platform_instance,
274
- env=config.env,
275
- )
276
- for table in entry_dict.get("downstream_tables", [])
277
- ],
278
- upstream_tables=[
279
- make_dataset_urn_with_platform_instance(
280
- name=table,
281
- platform=config.platform,
282
- platform_instance=config.platform_instance,
283
- env=config.env,
284
- )
285
- for table in entry_dict.get("upstream_tables", [])
286
- ],
287
- session_id=entry_dict.get("session_id"),
288
- )
324
+ """Create QueryEntry from dict with config context."""
325
+ # Set validation context for URN creation
326
+ cls._validation_context = config
327
+ try:
328
+ return cls.parse_obj(entry_dict)
329
+ finally:
330
+ cls._validation_context = None
@@ -4,8 +4,9 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
4
4
 
5
5
  import dataclasses
6
6
  import logging
7
+ from concurrent.futures import ThreadPoolExecutor
7
8
  from datetime import datetime
8
- from typing import Any, Dict, Iterable, List, Optional, Union, cast
9
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast
9
10
  from unittest.mock import patch
10
11
 
11
12
  import cachetools
@@ -28,6 +29,7 @@ from databricks.sdk.service.sql import (
28
29
  )
29
30
  from databricks.sdk.service.workspace import ObjectType
30
31
  from databricks.sql import connect
32
+ from databricks.sql.types import Row
31
33
 
32
34
  from datahub._version import nice_version_name
33
35
  from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
@@ -291,10 +293,59 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
291
293
  method, path, body={**body, "page_token": response["next_page_token"]}
292
294
  )
293
295
 
296
+ @cached(cachetools.FIFOCache(maxsize=100))
297
+ def get_catalog_column_lineage(self, catalog: str) -> Dict[str, Dict[str, dict]]:
298
+ """Get column lineage for all tables in a catalog."""
299
+ logger.info(f"Fetching column lineage for catalog: {catalog}")
300
+ try:
301
+ query = """
302
+ SELECT
303
+ source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
304
+ target_table_schema, target_table_name, target_column_name,
305
+ max(event_time)
306
+ FROM system.access.column_lineage
307
+ WHERE
308
+ target_table_catalog = %s
309
+ AND target_table_schema IS NOT NULL
310
+ AND target_table_name IS NOT NULL
311
+ AND target_column_name IS NOT NULL
312
+ AND source_table_catalog IS NOT NULL
313
+ AND source_table_schema IS NOT NULL
314
+ AND source_table_name IS NOT NULL
315
+ AND source_column_name IS NOT NULL
316
+ GROUP BY
317
+ source_table_catalog, source_table_schema, source_table_name, source_column_name, source_type,
318
+ target_table_schema, target_table_name, target_column_name
319
+ """
320
+ rows = self._execute_sql_query(query, (catalog,))
321
+
322
+ result_dict: Dict[str, Dict[str, dict]] = {}
323
+ for row in rows:
324
+ result_dict.setdefault(row["target_table_schema"], {}).setdefault(
325
+ row["target_table_name"], {}
326
+ ).setdefault(row["target_column_name"], []).append(
327
+ # make fields look like the response from the older HTTP API
328
+ {
329
+ "catalog_name": row["source_table_catalog"],
330
+ "schema_name": row["source_table_schema"],
331
+ "table_name": row["source_table_name"],
332
+ "name": row["source_column_name"],
333
+ }
334
+ )
335
+
336
+ return result_dict
337
+ except Exception as e:
338
+ logger.warning(
339
+ f"Error getting column lineage for catalog {catalog}: {e}",
340
+ exc_info=True,
341
+ )
342
+ return {}
343
+
294
344
  def list_lineages_by_table(
295
345
  self, table_name: str, include_entity_lineage: bool
296
346
  ) -> dict:
297
347
  """List table lineage by table name."""
348
+ logger.debug(f"Getting table lineage for {table_name}")
298
349
  return self._workspace_client.api_client.do( # type: ignore
299
350
  method="GET",
300
351
  path="/api/2.0/lineage-tracking/table-lineage",
@@ -304,13 +355,24 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
304
355
  },
305
356
  )
306
357
 
307
- def list_lineages_by_column(self, table_name: str, column_name: str) -> dict:
358
+ def list_lineages_by_column(self, table_name: str, column_name: str) -> list:
308
359
  """List column lineage by table name and column name."""
309
- return self._workspace_client.api_client.do( # type: ignore
310
- "GET",
311
- "/api/2.0/lineage-tracking/column-lineage",
312
- body={"table_name": table_name, "column_name": column_name},
313
- )
360
+ logger.debug(f"Getting column lineage for {table_name}.{column_name}")
361
+ try:
362
+ return (
363
+ self._workspace_client.api_client.do( # type: ignore
364
+ "GET",
365
+ "/api/2.0/lineage-tracking/column-lineage",
366
+ body={"table_name": table_name, "column_name": column_name},
367
+ ).get("upstream_cols")
368
+ or []
369
+ )
370
+ except Exception as e:
371
+ logger.warning(
372
+ f"Error getting column lineage on table {table_name}, column {column_name}: {e}",
373
+ exc_info=True,
374
+ )
375
+ return []
314
376
 
315
377
  def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
316
378
  if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
@@ -348,23 +410,51 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
348
410
  f"Error getting lineage on table {table.ref}: {e}", exc_info=True
349
411
  )
350
412
 
351
- def get_column_lineage(self, table: Table, column_name: str) -> None:
413
+ def get_column_lineage(
414
+ self,
415
+ table: Table,
416
+ column_names: List[str],
417
+ *,
418
+ max_workers: Optional[int] = None,
419
+ ) -> None:
352
420
  try:
353
- response: dict = self.list_lineages_by_column(
354
- table_name=table.ref.qualified_table_name,
355
- column_name=column_name,
356
- )
357
- for item in response.get("upstream_cols") or []:
358
- table_ref = TableReference.create_from_lineage(
359
- item, table.schema.catalog.metastore
421
+ # use the newer system tables if we have a SQL warehouse, otherwise fall back
422
+ # and use the older (and much slower) HTTP API.
423
+ if self.warehouse_id:
424
+ lineage = (
425
+ self.get_catalog_column_lineage(table.ref.catalog)
426
+ .get(table.ref.schema, {})
427
+ .get(table.ref.table, {})
360
428
  )
361
- if table_ref:
362
- table.upstreams.setdefault(table_ref, {}).setdefault(
363
- column_name, []
364
- ).append(item["name"])
429
+ else:
430
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
431
+ futures = [
432
+ executor.submit(
433
+ self.list_lineages_by_column,
434
+ table.ref.qualified_table_name,
435
+ column_name,
436
+ )
437
+ for column_name in column_names
438
+ ]
439
+ lineage = {
440
+ column_name: future.result()
441
+ for column_name, future in zip(column_names, futures)
442
+ }
443
+
444
+ for column_name in column_names:
445
+ for item in lineage.get(column_name) or []:
446
+ table_ref = TableReference.create_from_lineage(
447
+ item,
448
+ table.schema.catalog.metastore,
449
+ )
450
+ if table_ref:
451
+ table.upstreams.setdefault(table_ref, {}).setdefault(
452
+ column_name, []
453
+ ).append(item["name"])
454
+
365
455
  except Exception as e:
366
456
  logger.warning(
367
- f"Error getting column lineage on table {table.ref}, column {column_name}: {e}",
457
+ f"Error getting column lineage on table {table.ref}: {e}",
368
458
  exc_info=True,
369
459
  )
370
460
 
@@ -504,14 +594,14 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
504
594
  executed_as_user_name=info.executed_as_user_name,
505
595
  )
506
596
 
507
- def _execute_sql_query(self, query: str) -> List[List[str]]:
597
+ def _execute_sql_query(self, query: str, params: Sequence[Any] = ()) -> List[Row]:
508
598
  """Execute SQL query using databricks-sql connector for better performance"""
509
599
  try:
510
600
  with (
511
601
  connect(**self._sql_connection_params) as connection,
512
602
  connection.cursor() as cursor,
513
603
  ):
514
- cursor.execute(query)
604
+ cursor.execute(query, list(params))
515
605
  return cursor.fetchall()
516
606
 
517
607
  except Exception as e:
@@ -1,7 +1,6 @@
1
1
  import logging
2
2
  import re
3
3
  import time
4
- from concurrent.futures import ThreadPoolExecutor
5
4
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
6
5
  from urllib.parse import urljoin
7
6
 
@@ -657,15 +656,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
657
656
  if len(table.columns) > self.config.column_lineage_column_limit:
658
657
  self.report.num_column_lineage_skipped_column_count += 1
659
658
 
660
- with ThreadPoolExecutor(
661
- max_workers=self.config.lineage_max_workers
662
- ) as executor:
663
- for column in table.columns[: self.config.column_lineage_column_limit]:
664
- executor.submit(
665
- self.unity_catalog_api_proxy.get_column_lineage,
666
- table,
667
- column.name,
668
- )
659
+ column_names = [
660
+ column.name
661
+ for column in table.columns[: self.config.column_lineage_column_limit]
662
+ ]
663
+ self.unity_catalog_api_proxy.get_column_lineage(
664
+ table, column_names, max_workers=self.config.lineage_max_workers
665
+ )
669
666
 
670
667
  return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table)
671
668
 
@@ -4319,6 +4319,14 @@
4319
4319
  "doc": "The type of upstream entity"
4320
4320
  },
4321
4321
  {
4322
+ "Searchable": {
4323
+ "/*": {
4324
+ "fieldName": "fineGrainedUpstreams",
4325
+ "fieldType": "URN",
4326
+ "hasValuesFieldName": "hasFineGrainedUpstreams",
4327
+ "queryByDefault": false
4328
+ }
4329
+ },
4322
4330
  "Urn": "Urn",
4323
4331
  "urn_is_array": true,
4324
4332
  "type": [
@@ -12875,6 +12883,7 @@
12875
12883
  "Searchable": {
12876
12884
  "fieldName": "upstreams",
12877
12885
  "fieldType": "URN",
12886
+ "hasValuesFieldName": "hasUpstreams",
12878
12887
  "queryByDefault": false
12879
12888
  },
12880
12889
  "java": {
@@ -375,6 +375,14 @@
375
375
  "doc": "The type of upstream entity"
376
376
  },
377
377
  {
378
+ "Searchable": {
379
+ "/*": {
380
+ "fieldName": "fineGrainedUpstreams",
381
+ "fieldType": "URN",
382
+ "hasValuesFieldName": "hasFineGrainedUpstreams",
383
+ "queryByDefault": false
384
+ }
385
+ },
378
386
  "type": [
379
387
  "null",
380
388
  {
@@ -3070,6 +3070,14 @@
3070
3070
  "doc": "The type of upstream entity"
3071
3071
  },
3072
3072
  {
3073
+ "Searchable": {
3074
+ "/*": {
3075
+ "fieldName": "fineGrainedUpstreams",
3076
+ "fieldType": "URN",
3077
+ "hasValuesFieldName": "hasFineGrainedUpstreams",
3078
+ "queryByDefault": false
3079
+ }
3080
+ },
3073
3081
  "type": [
3074
3082
  "null",
3075
3083
  {
@@ -3691,6 +3699,7 @@
3691
3699
  "Searchable": {
3692
3700
  "fieldName": "upstreams",
3693
3701
  "fieldType": "URN",
3702
+ "hasValuesFieldName": "hasUpstreams",
3694
3703
  "queryByDefault": false
3695
3704
  },
3696
3705
  "java": {
@@ -94,6 +94,7 @@
94
94
  "Searchable": {
95
95
  "fieldName": "upstreams",
96
96
  "fieldType": "URN",
97
+ "hasValuesFieldName": "hasUpstreams",
97
98
  "queryByDefault": false
98
99
  },
99
100
  "java": {
@@ -199,6 +200,14 @@
199
200
  "doc": "The type of upstream entity"
200
201
  },
201
202
  {
203
+ "Searchable": {
204
+ "/*": {
205
+ "fieldName": "fineGrainedUpstreams",
206
+ "fieldType": "URN",
207
+ "hasValuesFieldName": "hasFineGrainedUpstreams",
208
+ "queryByDefault": false
209
+ }
210
+ },
202
211
  "type": [
203
212
  "null",
204
213
  {