acryl-datahub 1.0.0.3rc5__py3-none-any.whl → 1.0.0.3rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/METADATA +2427 -2427
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/RECORD +18 -16
- datahub/_version.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/sdk/_shared.py +3 -5
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +4 -1
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +2 -6
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.0.0.3rc5.dist-info → acryl_datahub-1.0.0.3rc7.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
-acryl_datahub-1.0.0.…
+acryl_datahub-1.0.0.3rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=…
+datahub/_version.py,sha256=n49MhTfoZh6lQvQMYp45DAwrexBWePxkeHMtc0_gSGk,323
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -334,7 +334,7 @@ datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H…
 datahub/ingestion/source/hex/model.py,sha256=S9bUhfFcjzuio2dBS6HzSyRVPiSJvRvMQ0qyVrjV5-E,1766
 datahub/ingestion/source/hex/query_fetcher.py,sha256=ZaRrja05mK0VlIvpsFi-8-EBoJr0GSLbUxBUjycibIU,12505
 datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/iceberg/iceberg.py,sha256=…
+datahub/ingestion/source/iceberg/iceberg.py,sha256=-8uaBerljvonaT7Gn9Evokq6-SSDiMRf8kKo7Hg1qY4,35414
 datahub/ingestion/source/iceberg/iceberg_common.py,sha256=VGosqYPmn_j6GETSnDHZ8Ay1BVOedmx2x5LHxw16I3A,12278
 datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=9iwp2vpQTi4OMbIKoDZV5lAdvjMR0ls6Llpck9grJIE,9875
 datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -350,7 +350,7 @@ datahub/ingestion/source/kafka_connect/sink_connectors.py,sha256=kEnxOXTik5HSDLj…
 datahub/ingestion/source/kafka_connect/source_connectors.py,sha256=OQ0vjz9xF0T30pRln_gDvelmaOE5jTAxwsCtm1K4SWM,21080
 datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
-datahub/ingestion/source/looker/looker_common.py,sha256=…
+datahub/ingestion/source/looker/looker_common.py,sha256=h1DLGyPCYbFM2QUjgJ8ZK_LDLzu-Q7F5X4VIRZMLweg,62817
 datahub/ingestion/source/looker/looker_config.py,sha256=eVKw1nn9D8hUFdRfNyT3MtzL8w-zWhFeokiwSnNKQuc,13607
 datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
 datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
@@ -359,7 +359,7 @@ datahub/ingestion/source/looker/looker_file_loader.py,sha256=gb2Z97_w28MsybYe01J…
 datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=2aEorQ3WjBzYVQIm-5QR2qDGAhpKflstwO5X9oboXS8,11553
 datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
 datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
-datahub/ingestion/source/looker/looker_source.py,sha256=…
+datahub/ingestion/source/looker/looker_source.py,sha256=TOOht_7c5PM77DhR-4hP7cPQZfvSuGU4F_xUeXDPfI8,67803
 datahub/ingestion/source/looker/looker_template_language.py,sha256=5fZFPKFP3IYbJg3jLifjaji4wWg8wRy-1XDvc8Qucus,17949
 datahub/ingestion/source/looker/looker_usage.py,sha256=qFBX7OHtIcarYIqFe0jQMrDV8MMPV_nN4PZrZRUznTw,23029
 datahub/ingestion/source/looker/looker_view_id_cache.py,sha256=92gDy6NONhJYBp92z_IBzDVZvezmUIkaBCZY1bdk6mE,4392
@@ -901,13 +901,14 @@ datahub/pydantic/compat.py,sha256=TUEo4kSEeOWVAhV6LQtst1phrpVgGtK4uif4OI5vQ2M,19…
 datahub/sdk/__init__.py,sha256=QeutS6Th8K4E4ZxXuoGrmvahN6zA9Oh9asKk5mw9AIk,1670
 datahub/sdk/_all_entities.py,sha256=inbLFv2T7dhZpGfBY5FPhCWbyE0P0G8umOt0Bc7V4XA,520
 datahub/sdk/_attribution.py,sha256=0Trh8steVd27GOr9MKCZeawbuDD2_q3GIsZlCtHqEUg,1321
-datahub/sdk/_shared.py,sha256=…
-datahub/sdk/_utils.py,sha256=…
+datahub/sdk/_shared.py,sha256=64m4Q7D5QdS-Z9Te0uOFwsZMBZEfwZ__2sZ8uCTHa7w,24770
+datahub/sdk/_utils.py,sha256=oXE2BzsXE5zmSkCP3R1tObD4RHnPeH_ps83D_Dw9JaQ,1169
 datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
-datahub/sdk/dataset.py,sha256=…
+datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
 datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
 datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
-datahub/sdk/…
+datahub/sdk/lineage_client.py,sha256=1JPxQx5pZYQwnzL_VX3KjAbX2QWdvjSKm8S8Bya2IAc,8617
+datahub/sdk/main_client.py,sha256=EeFZw4ot6L8DP4xHMiy07n7yICTjBn080YtW8ub5mBk,3781
 datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
 datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
 datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
@@ -936,14 +937,15 @@ datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu…
 datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
 datahub/sql_parsing/_sqlglot_patch.py,sha256=dpTiGj4Jxzvbi6HkYiOTbZ81CTFwEEo6JgYOd-dnCMA,6788
 datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
 datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
 datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
 datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
-datahub/sql_parsing/sql_parsing_aggregator.py,sha256=…
+datahub/sql_parsing/sql_parsing_aggregator.py,sha256=zKxIffMWK8YKIN0A7wzOfVkMGC7Cd_9nB6gWc-6BHuw,70671
 datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
-datahub/sql_parsing/sqlglot_utils.py,sha256=…
+datahub/sql_parsing/sqlglot_utils.py,sha256=TI11oBu1wrGeUuUGBg7hGTr6lTvztahdqiqXNJYRfbQ,14823
 datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
 datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1048,8 +1050,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp…
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0.…
-acryl_datahub-1.0.0.…
-acryl_datahub-1.0.0.…
-acryl_datahub-1.0.0.…
-acryl_datahub-1.0.0.…
+acryl_datahub-1.0.0.3rc7.dist-info/METADATA,sha256=cS9ynN7khxxthJ9wKCRUKd8-_EHBy-cjhxBPx-KqCu0,176989
+acryl_datahub-1.0.0.3rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+acryl_datahub-1.0.0.3rc7.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+acryl_datahub-1.0.0.3rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.3rc7.dist-info/RECORD,,
datahub/_version.py
CHANGED

-__version__ = "1.0.0.3rc5"
+__version__ = "1.0.0.3rc7"

datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
                     env=self.config.env,
                 )
             )
+            namespace_properties: Properties = catalog.load_namespace_properties(
+                namespace
+            )
             namespaces.append((namespace, namespace_urn))
-            for aspect in self._create_iceberg_namespace_aspects(namespace):
+            for aspect in self._create_iceberg_namespace_aspects(
+                namespace, namespace_properties
+            ):
                 yield stamping_processor.stamp_wu(
                     MetadataChangeProposalWrapper(
                         entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
         return self.report

     def _create_iceberg_namespace_aspects(
-        self, namespace: Identifier
+        self, namespace: Identifier, properties: Properties
     ) -> Iterable[_Aspect]:
         namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
         yield Status(removed=False)
         yield ContainerProperties(
-            name=namespace_repr,
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
         dpi = self._get_dataplatform_instance_aspect()
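
The net effect is that Iceberg namespace properties now surface on the namespace container as custom properties, alongside a qualifiedName and env. A minimal standalone sketch of the same stringification step, assuming a local pyiceberg REST catalog and an "analytics" namespace (both placeholders):

from pyiceberg.catalog import load_catalog

# Placeholder catalog config; any pyiceberg catalog type behaves the same here.
catalog = load_catalog("demo", **{"type": "rest", "uri": "http://localhost:8181"})

# Namespace properties come back as a plain mapping, e.g. {"owner": "data-eng"}.
props = catalog.load_namespace_properties(("analytics",))

# Same defensive stringification the source applies before emitting
# ContainerProperties(customProperties=...).
custom_properties = {str(k): str(v) for k, v in props.items()}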
datahub/ingestion/source/looker/looker_common.py
CHANGED

@@ -471,7 +471,10 @@ def get_view_file_path(
     logger.debug("Entered")

     for field in lkml_fields:
-        if LookerUtil.extract_view_name_from_lookml_model_explore_field(field) == view_name:
+        if (
+            LookerUtil.extract_view_name_from_lookml_model_explore_field(field)
+            == view_name
+        ):
             # This path is relative to git clone directory
             logger.debug(f"Found view({view_name}) file-path {field.source_file}")
             return field.source_file
@@ -1103,7 +1106,7 @@ class LookerExplore:
                     [column_ref] if column_ref is not None else []
                 )

-                return cls(
+                looker_explore = cls(
                     name=explore_name,
                     model_name=model,
                     project_name=explore.project_name,
@@ -1121,6 +1124,8 @@ class LookerExplore:
                     source_file=explore.source_file,
                     tags=list(explore.tags) if explore.tags is not None else [],
                 )
+                logger.debug(f"Created LookerExplore from API: {looker_explore}")
+                return looker_explore
         except SDKError as e:
             if "<title>Looker Not Found (404)</title>" in str(e):
                 logger.info(
@@ -1161,6 +1166,9 @@ class LookerExplore:
         dataset_name = config.explore_naming_pattern.replace_variables(
             self.get_mapping(config)
         )
+        logger.debug(
+            f"Generated dataset_name={dataset_name} for explore with model_name={self.model_name}, name={self.name}"
+        )

         return builder.make_dataset_urn_with_platform_instance(
             platform=config.platform_name,
@@ -1362,6 +1370,7 @@ class LookerExploreRegistry:

     @lru_cache(maxsize=200)
     def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
+        logger.debug(f"Retrieving explore: model={model}, explore={explore}")
         looker_explore = LookerExplore.from_api(
             model,
             explore,
@@ -1369,6 +1378,12 @@ class LookerExploreRegistry:
             self.report,
             self.source_config,
         )
+        if looker_explore is not None:
+            logger.debug(
+                f"Found explore with model_name={looker_explore.model_name}, name={looker_explore.name}"
+            )
+        else:
+            logger.debug(f"No explore found for model={model}, explore={explore}")
         return looker_explore

     def compute_stats(self) -> Dict:
datahub/ingestion/source/looker/looker_source.py
CHANGED

@@ -279,6 +279,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             return []
         result = []

+        if query is not None:
+            logger.debug(
+                f"Processing query: model={query.model}, view={query.view}, input_fields_count={len(query.fields) if query.fields else 0}"
+            )
+
         # query.dynamic_fields can contain:
         # - looker table calculations: https://docs.looker.com/exploring-data/using-table-calculations
         # - looker custom measures: https://docs.looker.com/de/exploring-data/adding-fields/custom-measure
@@ -399,9 +404,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             # Get the explore from the view directly
             explores = [element.query.view] if element.query.view is not None else []
             logger.debug(
-                f"…
+                f"Dashboard element {element.title} (ID: {element.id}): Upstream explores added via query={explores} with model={element.query.model}, explore={element.query.view}"
             )
             for exp in explores:
+                logger.debug(
+                    f"Adding reachable explore: model={element.query.model}, explore={exp}, element_id={element.id}, title={element.title}"
+                )
                 self.add_reachable_explore(
                     model=element.query.model,
                     explore=exp,
@@ -477,12 +485,10 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         # Failing the above two approaches, pick out details from result_maker
         elif element.result_maker is not None:
-            model: str = ""
             input_fields = []

             explores = []
             if element.result_maker.query is not None:
-                model = element.result_maker.query.model
                 if element.result_maker.query.view is not None:
                     explores.append(element.result_maker.query.view)
                 input_fields = self._get_input_fields_from_query(
@@ -502,9 +508,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

             # In addition to the query, filters can point to fields as well
             assert element.result_maker.filterables is not None
+
+            # Different dashboard elements my reference explores from different models
+            # so we need to create a mapping of explore names to their models to maintain correct associations
+            explore_to_model_map = {}
+
             for filterable in element.result_maker.filterables:
                 if filterable.view is not None and filterable.model is not None:
-                    model…
+                    # Store the model for this view/explore in our mapping
+                    explore_to_model_map[filterable.view] = filterable.model
                     explores.append(filterable.view)
                 self.add_reachable_explore(
                     model=filterable.model,
@@ -527,6 +539,18 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

             explores = sorted(list(set(explores)))  # dedup the list of views

+            logger.debug(
+                f"Dashboard element {element.id} and their explores with the corresponding model: {explore_to_model_map}"
+            )
+
+            # If we have a query, use its model as the default for any explores that don't have a model in our mapping
+            default_model = ""
+            if (
+                element.result_maker.query is not None
+                and element.result_maker.query.model is not None
+            ):
+                default_model = element.result_maker.query.model
+
             return LookerDashboardElement(
                 id=element.id,
                 title=element.title if element.title is not None else "",
@@ -540,7 +564,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
                     else ""
                 ),
                 upstream_explores=[
-                    LookerExplore(…
+                    LookerExplore(
+                        model_name=explore_to_model_map.get(exp, default_model),
+                        name=exp,
+                    )
                     for exp in explores
                 ],
                 input_fields=input_fields,
                 owner=None,
@@ -1270,6 +1298,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         chart_urn = self._make_chart_urn(
             element_id=dashboard_element.get_urn_element_id()
         )
+
         input_fields_aspect = InputFieldsClass(
             fields=self._input_fields_from_dashboard_element(dashboard_element)
         )
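
The substantive change above replaces the single last-writer-wins model variable with explore_to_model_map, so each upstream explore keeps the model it actually came from, falling back to the query's model. A toy illustration of the lookup, with made-up values:

# Hypothetical values for a dashboard element whose filters span two models.
explore_to_model_map = {"orders": "sales_model", "sessions": "web_model"}
default_model = "sales_model"  # taken from element.result_maker.query.model

for exp in ["orders", "sessions", "pageviews"]:
    # "pageviews" has no filter-supplied model, so it falls back to the query's model.
    print(exp, "->", explore_to_model_map.get(exp, default_model))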
datahub/sdk/_shared.py
CHANGED

@@ -41,7 +41,7 @@ from datahub.metadata.urns import (
     Urn,
     VersionSetUrn,
 )
-from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk._utils import DEFAULT_ACTOR_URN, add_list_unique, remove_list_unique
 from datahub.sdk.entity import Entity
 from datahub.utilities.urns.error import InvalidUrnError

@@ -54,8 +54,6 @@ DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]

 ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]

-_DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()
-
 TrainingMetricsInputType: TypeAlias = Union[
     List[models.MLMetricClass], Dict[str, Optional[str]]
 ]
@@ -475,7 +473,7 @@ class HasTerms(Entity):
     def _terms_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=0,
-            actor=_DEFAULT_ACTOR_URN,
+            actor=DEFAULT_ACTOR_URN,
         )

     def set_terms(self, terms: TermsInputType) -> None:
@@ -563,7 +561,7 @@ class HasInstitutionalMemory(Entity):
     def _institutional_memory_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=0,
-            actor=_DEFAULT_ACTOR_URN,
+            actor=DEFAULT_ACTOR_URN,
         )

     @classmethod
datahub/sdk/_utils.py
CHANGED

@@ -1,6 +1,10 @@
 from typing import Any, Callable, List, Protocol, TypeVar

 from datahub.errors import ItemNotFoundError
+from datahub.metadata.urns import CorpUserUrn
+
+# TODO: Change __ingestion to _ingestion.
+DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()


 class _SupportsEq(Protocol):
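
For reference, the relocated constant resolves to a plain string URN; a quick sanity check using only the package itself:

from datahub.metadata.urns import CorpUserUrn

# DEFAULT_ACTOR_URN is just the stringified corpuser URN for "__ingestion".
assert CorpUserUrn("__ingestion").urn() == "urn:li:corpuser:__ingestion"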
datahub/sdk/dataset.py
CHANGED

@@ -87,7 +87,7 @@ def _parse_upstream_input(
     assert_never(upstream_input)


-def _parse_cll_mapping(
+def parse_cll_mapping(
     *,
     upstream: DatasetUrnOrStr,
     downstream: DatasetUrnOrStr,
@@ -142,7 +142,7 @@ def _parse_upstream_lineage_input(
         )
     )
     cll.extend(
-        _parse_cll_mapping(
+        parse_cll_mapping(
             upstream=dataset_urn,
             downstream=downstream_urn,
             cll_mapping=column_lineage,
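
With the leading underscore dropped, parse_cll_mapping is importable by other modules (the new LineageClient below relies on it). A hedged usage sketch with placeholder URNs; the mapping goes from each downstream column to the upstream columns it derives from:

from datahub.sdk.dataset import parse_cll_mapping

cll = parse_cll_mapping(
    upstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.raw_users,PROD)",
    downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
    cll_mapping={
        "id": ["id"],  # downstream column -> upstream columns
        "full_name": ["first_name", "last_name"],
    },
)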
datahub/sdk/lineage_client.py
ADDED

@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+import difflib
+import logging
+from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
+
+import datahub.metadata.schema_classes as models
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.errors import SdkUsageError
+from datahub.metadata.schema_classes import SchemaMetadataClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn
+from datahub.sdk._shared import DatasetUrnOrStr
+from datahub.sdk._utils import DEFAULT_ACTOR_URN
+from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+from datahub.specific.dataset import DatasetPatchBuilder
+from datahub.sql_parsing.fingerprint_utils import generate_hash
+from datahub.utilities.ordered_set import OrderedSet
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+logger = logging.getLogger(__name__)
+
+_empty_audit_stamp = models.AuditStampClass(
+    time=0,
+    actor=DEFAULT_ACTOR_URN,
+)
+
+
+class LineageClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+
+    def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
+        schema_metadata = self._client._graph.get_aspect(
+            str(dataset_urn), SchemaMetadataClass
+        )
+        if schema_metadata is None:
+            return Set()
+
+        return {field.fieldPath for field in schema_metadata.fields}
+
+    @classmethod
+    def _get_strict_column_lineage(
+        cls,
+        upstream_fields: Set[str],
+        downstream_fields: Set[str],
+    ) -> ColumnLineageMapping:
+        """Find matches between upstream and downstream fields with case-insensitive matching."""
+        strict_column_lineage: ColumnLineageMapping = {}
+
+        # Create case-insensitive mapping of upstream fields
+        case_insensitive_map = {field.lower(): field for field in upstream_fields}
+
+        # Match downstream fields using case-insensitive comparison
+        for downstream_field in downstream_fields:
+            lower_field = downstream_field.lower()
+            if lower_field in case_insensitive_map:
+                # Use the original case of the upstream field
+                strict_column_lineage[downstream_field] = [
+                    case_insensitive_map[lower_field]
+                ]
+
+        return strict_column_lineage
+
+    @classmethod
+    def _get_fuzzy_column_lineage(
+        cls,
+        upstream_fields: Set[str],
+        downstream_fields: Set[str],
+    ) -> ColumnLineageMapping:
+        """Generate fuzzy matches between upstream and downstream fields."""
+
+        # Simple normalization function for better matching
+        def normalize(s: str) -> str:
+            return s.lower().replace("_", "")
+
+        # Create normalized lookup for upstream fields
+        normalized_upstream = {normalize(field): field for field in upstream_fields}
+
+        fuzzy_column_lineage = {}
+        for downstream_field in downstream_fields:
+            # Try exact match first
+            if downstream_field in upstream_fields:
+                fuzzy_column_lineage[downstream_field] = [downstream_field]
+                continue
+
+            # Try normalized match
+            norm_downstream = normalize(downstream_field)
+            if norm_downstream in normalized_upstream:
+                fuzzy_column_lineage[downstream_field] = [
+                    normalized_upstream[norm_downstream]
+                ]
+                continue
+
+            # If no direct match, find closest match using similarity
+            matches = difflib.get_close_matches(
+                norm_downstream,
+                normalized_upstream.keys(),
+                n=1,  # Return only the best match
+                cutoff=0.8,  # Adjust cutoff for sensitivity
+            )
+
+            if matches:
+                fuzzy_column_lineage[downstream_field] = [
+                    normalized_upstream[matches[0]]
+                ]
+
+        return fuzzy_column_lineage
+
+    def add_dataset_copy_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: DatasetUrnOrStr,
+        column_lineage: Union[
+            None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+        ] = "auto_fuzzy",
+    ) -> None:
+        upstream = DatasetUrn.from_string(upstream)
+        downstream = DatasetUrn.from_string(downstream)
+
+        if column_lineage is None:
+            cll = None
+        elif column_lineage in ["auto_fuzzy", "auto_strict"]:
+            upstream_schema = self._get_fields_from_dataset_urn(upstream)
+            downstream_schema = self._get_fields_from_dataset_urn(downstream)
+            if column_lineage == "auto_fuzzy":
+                mapping = self._get_fuzzy_column_lineage(
+                    upstream_schema, downstream_schema
+                )
+            else:
+                mapping = self._get_strict_column_lineage(
+                    upstream_schema, downstream_schema
+                )
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=mapping,
+            )
+        elif isinstance(column_lineage, dict):
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=column_lineage,
+            )
+
+        updater = DatasetPatchBuilder(str(downstream))
+        updater.add_upstream_lineage(
+            models.UpstreamClass(
+                dataset=str(upstream),
+                type=models.DatasetLineageTypeClass.COPY,
+            )
+        )
+        for cl in cll or []:
+            updater.add_fine_grained_upstream_lineage(cl)
+
+        self._client.entities.update(updater)
+
+    def add_dataset_transform_lineage(
+        self,
+        *,
+        upstream: DatasetUrnOrStr,
+        downstream: DatasetUrnOrStr,
+        column_lineage: Optional[ColumnLineageMapping] = None,
+        query_text: Optional[str] = None,
+    ) -> None:
+        upstream = DatasetUrn.from_string(upstream)
+        downstream = DatasetUrn.from_string(downstream)
+
+        cll = None
+        if column_lineage is not None:
+            cll = parse_cll_mapping(
+                upstream=upstream,
+                downstream=downstream,
+                cll_mapping=column_lineage,
+            )
+
+        fields_involved = OrderedSet([str(upstream), str(downstream)])
+        if cll is not None:
+            for c in cll:
+                for field in c.upstreams or []:
+                    fields_involved.add(field)
+                for field in c.downstreams or []:
+                    fields_involved.add(field)
+
+        query_urn = None
+        query_entity = None
+        if query_text:
+            # Eventually we might want to use our regex-based fingerprinting instead.
+            fingerprint = generate_hash(query_text)
+            query_urn = QueryUrn(fingerprint).urn()
+
+            from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
+
+            query_entity = MetadataChangeProposalWrapper.construct_many(
+                query_urn,
+                aspects=[
+                    models.QueryPropertiesClass(
+                        statement=models.QueryStatementClass(
+                            value=query_text, language=models.QueryLanguageClass.SQL
+                        ),
+                        source=models.QuerySourceClass.SYSTEM,
+                        created=_empty_audit_stamp,
+                        lastModified=_empty_audit_stamp,
+                    ),
+                    make_query_subjects(list(fields_involved)),
+                ],
+            )
+
+        updater = DatasetPatchBuilder(str(downstream))
+        updater.add_upstream_lineage(
+            models.UpstreamClass(
+                dataset=str(upstream),
+                type=models.DatasetLineageTypeClass.TRANSFORMED,
+                query=query_urn,
+            )
+        )
+        for cl in cll or []:
+            cl.query = query_urn
+            updater.add_fine_grained_upstream_lineage(cl)
+
+        # Throw if the dataset does not exist.
+        # We need to manually call .build() instead of reusing client.update()
+        # so that we make just one emit_mcps call.
+        if not self._client._graph.exists(updater.urn):
+            raise SdkUsageError(
+                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
+            )
+        mcps: List[
+            Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
+        ] = list(updater.build())
+        if query_entity:
+            mcps.extend(query_entity)
+        self._client._graph.emit_mcps(mcps)
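
The fuzzy matcher above tries an exact match, then a normalized match (lowercased, underscores stripped), and only then falls back to difflib.get_close_matches with a 0.8 cutoff. A self-contained rerun of that cascade on made-up field names:

import difflib

upstream_fields = {"UserID", "first_name", "lastName"}

def normalize(s: str) -> str:
    return s.lower().replace("_", "")

normalized_upstream = {normalize(f): f for f in upstream_fields}

for downstream_field in ["userid", "firstname", "last_name", "email"]:
    key = normalize(downstream_field)
    if key in normalized_upstream:
        # Exact-after-normalization match: userid -> UserID, etc.
        print(downstream_field, "->", normalized_upstream[key])
    else:
        # Similarity fallback; "email" finds nothing above the 0.8 cutoff.
        matches = difflib.get_close_matches(key, normalized_upstream.keys(), n=1, cutoff=0.8)
        print(downstream_field, "->", normalized_upstream[matches[0]] if matches else None)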
datahub/sdk/main_client.py
CHANGED

@@ -6,6 +6,7 @@ from datahub.errors import SdkUsageError
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
+from datahub.sdk.lineage_client import LineageClient
 from datahub.sdk.resolver_client import ResolverClient
 from datahub.sdk.search_client import SearchClient

@@ -99,4 +100,6 @@ class DataHubClient:
     def search(self) -> SearchClient:
         return SearchClient(self)

-
+    @property
+    def lineage(self) -> LineageClient:
+        return LineageClient(self)