acryl-datahub 1.0.0.3rc5__py3-none-any.whl → 1.0.0.3rc7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.3rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.3rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=QrGnvYk4Mo48c9uZMV5srbywzBX4Fcxp27GXbDul8F4,323
+ datahub/_version.py,sha256=n49MhTfoZh6lQvQMYp45DAwrexBWePxkeHMtc0_gSGk,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -334,7 +334,7 @@ datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H
  datahub/ingestion/source/hex/model.py,sha256=S9bUhfFcjzuio2dBS6HzSyRVPiSJvRvMQ0qyVrjV5-E,1766
  datahub/ingestion/source/hex/query_fetcher.py,sha256=ZaRrja05mK0VlIvpsFi-8-EBoJr0GSLbUxBUjycibIU,12505
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/iceberg/iceberg.py,sha256=s69XzCGD5oV_hqTyvzCt5eLKZVEzVIJo_DiAEDk3p6A,34759
+ datahub/ingestion/source/iceberg/iceberg.py,sha256=-8uaBerljvonaT7Gn9Evokq6-SSDiMRf8kKo7Hg1qY4,35414
  datahub/ingestion/source/iceberg/iceberg_common.py,sha256=VGosqYPmn_j6GETSnDHZ8Ay1BVOedmx2x5LHxw16I3A,12278
  datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=9iwp2vpQTi4OMbIKoDZV5lAdvjMR0ls6Llpck9grJIE,9875
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -350,7 +350,7 @@ datahub/ingestion/source/kafka_connect/sink_connectors.py,sha256=kEnxOXTik5HSDLj
  datahub/ingestion/source/kafka_connect/source_connectors.py,sha256=OQ0vjz9xF0T30pRln_gDvelmaOE5jTAxwsCtm1K4SWM,21080
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
- datahub/ingestion/source/looker/looker_common.py,sha256=hR4iufcjwh_iBzIJs9tYR4lwSn6vXk2WF5VFAYepKGM,62094
+ datahub/ingestion/source/looker/looker_common.py,sha256=h1DLGyPCYbFM2QUjgJ8ZK_LDLzu-Q7F5X4VIRZMLweg,62817
  datahub/ingestion/source/looker/looker_config.py,sha256=eVKw1nn9D8hUFdRfNyT3MtzL8w-zWhFeokiwSnNKQuc,13607
  datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
  datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
@@ -359,7 +359,7 @@ datahub/ingestion/source/looker/looker_file_loader.py,sha256=gb2Z97_w28MsybYe01J
  datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=2aEorQ3WjBzYVQIm-5QR2qDGAhpKflstwO5X9oboXS8,11553
  datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
  datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
- datahub/ingestion/source/looker/looker_source.py,sha256=rMuq0awCCmKJLbxPDCzl6VlrRe7vgRZ9wJ68b6iCNt4,66383
+ datahub/ingestion/source/looker/looker_source.py,sha256=TOOht_7c5PM77DhR-4hP7cPQZfvSuGU4F_xUeXDPfI8,67803
  datahub/ingestion/source/looker/looker_template_language.py,sha256=5fZFPKFP3IYbJg3jLifjaji4wWg8wRy-1XDvc8Qucus,17949
  datahub/ingestion/source/looker/looker_usage.py,sha256=qFBX7OHtIcarYIqFe0jQMrDV8MMPV_nN4PZrZRUznTw,23029
  datahub/ingestion/source/looker/looker_view_id_cache.py,sha256=92gDy6NONhJYBp92z_IBzDVZvezmUIkaBCZY1bdk6mE,4392
@@ -901,13 +901,14 @@ datahub/pydantic/compat.py,sha256=TUEo4kSEeOWVAhV6LQtst1phrpVgGtK4uif4OI5vQ2M,19
  datahub/sdk/__init__.py,sha256=QeutS6Th8K4E4ZxXuoGrmvahN6zA9Oh9asKk5mw9AIk,1670
  datahub/sdk/_all_entities.py,sha256=inbLFv2T7dhZpGfBY5FPhCWbyE0P0G8umOt0Bc7V4XA,520
  datahub/sdk/_attribution.py,sha256=0Trh8steVd27GOr9MKCZeawbuDD2_q3GIsZlCtHqEUg,1321
- datahub/sdk/_shared.py,sha256=5L1IkihLc7Pd2x0ypDs96kZ8ecm6o0-UZEn0J1Sffqw,24808
- datahub/sdk/_utils.py,sha256=aGE665Su8SGtj2CRDiTaXNYrJ8ADBsS0m4DmaXw79b8,1027
+ datahub/sdk/_shared.py,sha256=64m4Q7D5QdS-Z9Te0uOFwsZMBZEfwZ__2sZ8uCTHa7w,24770
+ datahub/sdk/_utils.py,sha256=oXE2BzsXE5zmSkCP3R1tObD4RHnPeH_ps83D_Dw9JaQ,1169
  datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
- datahub/sdk/dataset.py,sha256=5LG4c_8bHeSPYrW88KNXRgiPD8frBjR0OBVrrwdquU4,29152
+ datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
  datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
- datahub/sdk/main_client.py,sha256=h2MKRhR-BO0zGCMhF7z2bTncX4hagKrAYwR3wTNTtzA,3666
+ datahub/sdk/lineage_client.py,sha256=1JPxQx5pZYQwnzL_VX3KjAbX2QWdvjSKm8S8Bya2IAc,8617
+ datahub/sdk/main_client.py,sha256=EeFZw4ot6L8DP4xHMiy07n7yICTjBn080YtW8ub5mBk,3781
  datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
  datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
  datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
@@ -936,14 +937,15 @@ datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
  datahub/sql_parsing/_sqlglot_patch.py,sha256=dpTiGj4Jxzvbi6HkYiOTbZ81CTFwEEo6JgYOd-dnCMA,6788
  datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+ datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
  datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42XXx5UyTEK9PkekIs,70317
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=zKxIffMWK8YKIN0A7wzOfVkMGC7Cd_9nB6gWc-6BHuw,70671
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
- datahub/sql_parsing/sqlglot_utils.py,sha256=5cUiEWLWfVTI7uIxolAfOfNVo50qnklzhj86gxSFWqg,14943
+ datahub/sql_parsing/sqlglot_utils.py,sha256=TI11oBu1wrGeUuUGBg7hGTr6lTvztahdqiqXNJYRfbQ,14823
  datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1048,8 +1050,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.3rc5.dist-info/METADATA,sha256=4yRDLa5bvQ1tAjJ2uzc4mKVmegeKOASmLpseMx9A1K8,176989
- acryl_datahub-1.0.0.3rc5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.3rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.3rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.3rc5.dist-info/RECORD,,
+ acryl_datahub-1.0.0.3rc7.dist-info/METADATA,sha256=cS9ynN7khxxthJ9wKCRUKd8-_EHBy-cjhxBPx-KqCu0,176989
+ acryl_datahub-1.0.0.3rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.3rc7.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.3rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.3rc7.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.3rc5"
+ __version__ = "1.0.0.3rc7"


  def is_dev_mode() -> bool:
datahub/ingestion/source/iceberg/iceberg.py CHANGED
@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
  )
  from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
  from pyiceberg.table import Table
- from pyiceberg.typedef import Identifier
+ from pyiceberg.typedef import Identifier, Properties
  from pyiceberg.types import (
  BinaryType,
  BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
  env=self.config.env,
  )
  )
+ namespace_properties: Properties = catalog.load_namespace_properties(
+ namespace
+ )
  namespaces.append((namespace, namespace_urn))
- for aspect in self._create_iceberg_namespace_aspects(namespace):
+ for aspect in self._create_iceberg_namespace_aspects(
+ namespace, namespace_properties
+ ):
  yield stamping_processor.stamp_wu(
  MetadataChangeProposalWrapper(
  entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
  return self.report

  def _create_iceberg_namespace_aspects(
- self, namespace: Identifier
+ self, namespace: Identifier, properties: Properties
  ) -> Iterable[_Aspect]:
  namespace_repr = ".".join(namespace)
+ custom_properties: Dict[str, str] = {}
+ for k, v in properties.items():
+ try:
+ custom_properties[str(k)] = str(v)
+ except Exception as e:
+ LOGGER.warning(
+ f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+ )
  yield Status(removed=False)
  yield ContainerProperties(
- name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+ name=namespace_repr,
+ qualifiedName=namespace_repr,
+ env=self.config.env,
+ customProperties=custom_properties,
  )
  yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
  dpi = self._get_dataplatform_instance_aspect()
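The net effect of the iceberg.py change is that properties returned by the catalog's load_namespace_properties() call are now attached to the namespace container as custom properties, with keys and values defensively coerced to strings. A minimal standalone sketch of that coercion step, using a hypothetical namespace_properties dict in place of the pyiceberg Properties object:

    from typing import Dict

    # Hypothetical stand-in for what catalog.load_namespace_properties(namespace) returns.
    namespace_properties = {"owner": "data-platform", "location": "s3://warehouse/analytics.db"}

    custom_properties: Dict[str, str] = {}
    for k, v in namespace_properties.items():
        try:
            # Mirrors the str() coercion in the diff; failures are only logged there.
            custom_properties[str(k)] = str(v)
        except Exception:
            pass

    print(custom_properties)
    # {'owner': 'data-platform', 'location': 's3://warehouse/analytics.db'}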
datahub/ingestion/source/looker/looker_common.py CHANGED
@@ -471,7 +471,10 @@ def get_view_file_path(
  logger.debug("Entered")

  for field in lkml_fields:
- if field.view == view_name:
+ if (
+ LookerUtil.extract_view_name_from_lookml_model_explore_field(field)
+ == view_name
+ ):
  # This path is relative to git clone directory
  logger.debug(f"Found view({view_name}) file-path {field.source_file}")
  return field.source_file
@@ -1103,7 +1106,7 @@ class LookerExplore:
  [column_ref] if column_ref is not None else []
  )

- return cls(
+ looker_explore = cls(
  name=explore_name,
  model_name=model,
  project_name=explore.project_name,
@@ -1121,6 +1124,8 @@ class LookerExplore:
  source_file=explore.source_file,
  tags=list(explore.tags) if explore.tags is not None else [],
  )
+ logger.debug(f"Created LookerExplore from API: {looker_explore}")
+ return looker_explore
  except SDKError as e:
  if "<title>Looker Not Found (404)</title>" in str(e):
  logger.info(
@@ -1161,6 +1166,9 @@ class LookerExplore:
  dataset_name = config.explore_naming_pattern.replace_variables(
  self.get_mapping(config)
  )
+ logger.debug(
+ f"Generated dataset_name={dataset_name} for explore with model_name={self.model_name}, name={self.name}"
+ )

  return builder.make_dataset_urn_with_platform_instance(
  platform=config.platform_name,
@@ -1362,6 +1370,7 @@ class LookerExploreRegistry:

  @lru_cache(maxsize=200)
  def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
+ logger.debug(f"Retrieving explore: model={model}, explore={explore}")
  looker_explore = LookerExplore.from_api(
  model,
  explore,
@@ -1369,6 +1378,12 @@ class LookerExploreRegistry:
  self.report,
  self.source_config,
  )
+ if looker_explore is not None:
+ logger.debug(
+ f"Found explore with model_name={looker_explore.model_name}, name={looker_explore.name}"
+ )
+ else:
+ logger.debug(f"No explore found for model={model}, explore={explore}")
  return looker_explore

  def compute_stats(self) -> Dict:
datahub/ingestion/source/looker/looker_source.py CHANGED
@@ -279,6 +279,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  return []
  result = []

+ if query is not None:
+ logger.debug(
+ f"Processing query: model={query.model}, view={query.view}, input_fields_count={len(query.fields) if query.fields else 0}"
+ )
+
  # query.dynamic_fields can contain:
  # - looker table calculations: https://docs.looker.com/exploring-data/using-table-calculations
  # - looker custom measures: https://docs.looker.com/de/exploring-data/adding-fields/custom-measure
@@ -399,9 +404,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  # Get the explore from the view directly
  explores = [element.query.view] if element.query.view is not None else []
  logger.debug(
- f"Element {element.title}: Explores added via query: {explores}"
+ f"Dashboard element {element.title} (ID: {element.id}): Upstream explores added via query={explores} with model={element.query.model}, explore={element.query.view}"
  )
  for exp in explores:
+ logger.debug(
+ f"Adding reachable explore: model={element.query.model}, explore={exp}, element_id={element.id}, title={element.title}"
+ )
  self.add_reachable_explore(
  model=element.query.model,
  explore=exp,
@@ -477,12 +485,10 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  # Failing the above two approaches, pick out details from result_maker
  elif element.result_maker is not None:
- model: str = ""
  input_fields = []

  explores = []
  if element.result_maker.query is not None:
- model = element.result_maker.query.model
  if element.result_maker.query.view is not None:
  explores.append(element.result_maker.query.view)
  input_fields = self._get_input_fields_from_query(
@@ -502,9 +508,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  # In addition to the query, filters can point to fields as well
  assert element.result_maker.filterables is not None
+
+ # Different dashboard elements my reference explores from different models
+ # so we need to create a mapping of explore names to their models to maintain correct associations
+ explore_to_model_map = {}
+
  for filterable in element.result_maker.filterables:
  if filterable.view is not None and filterable.model is not None:
- model = filterable.model
+ # Store the model for this view/explore in our mapping
+ explore_to_model_map[filterable.view] = filterable.model
  explores.append(filterable.view)
  self.add_reachable_explore(
  model=filterable.model,
@@ -527,6 +539,18 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  explores = sorted(list(set(explores))) # dedup the list of views

+ logger.debug(
+ f"Dashboard element {element.id} and their explores with the corresponding model: {explore_to_model_map}"
+ )
+
+ # If we have a query, use its model as the default for any explores that don't have a model in our mapping
+ default_model = ""
+ if (
+ element.result_maker.query is not None
+ and element.result_maker.query.model is not None
+ ):
+ default_model = element.result_maker.query.model
+
  return LookerDashboardElement(
  id=element.id,
  title=element.title if element.title is not None else "",
@@ -540,7 +564,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  else ""
  ),
  upstream_explores=[
- LookerExplore(model_name=model, name=exp) for exp in explores
+ LookerExplore(
+ model_name=explore_to_model_map.get(exp, default_model),
+ name=exp,
+ )
+ for exp in explores
  ],
  input_fields=input_fields,
  owner=None,
@@ -1270,6 +1298,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  chart_urn = self._make_chart_urn(
  element_id=dashboard_element.get_urn_element_id()
  )
+
  input_fields_aspect = InputFieldsClass(
  fields=self._input_fields_from_dashboard_element(dashboard_element)
  )
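To make the behavior change concrete: previously a single `model` string was overwritten as each filterable was processed, so when a dashboard element referenced explores from more than one LookML model, every upstream explore was attributed to whichever model was seen last. The new `explore_to_model_map` keeps each explore paired with its own model, falling back to the query's model only when no mapping entry exists. A small standalone sketch of that logic, with hypothetical view/model pairs standing in for the Looker API filterable objects:

    filterables = [
        ("orders", "ecommerce"),
        ("sessions", "web_analytics"),
    ]
    default_model = "ecommerce"  # would come from result_maker.query.model when present

    explore_to_model_map = {}
    explores = []
    for view, model in filterables:
        explore_to_model_map[view] = model
        explores.append(view)

    # Each explore resolves to its own model instead of the last one seen.
    upstreams = [
        (explore_to_model_map.get(exp, default_model), exp)
        for exp in sorted(set(explores))
    ]
    print(upstreams)  # [('ecommerce', 'orders'), ('web_analytics', 'sessions')]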
datahub/sdk/_shared.py CHANGED
@@ -41,7 +41,7 @@ from datahub.metadata.urns import (
  Urn,
  VersionSetUrn,
  )
- from datahub.sdk._utils import add_list_unique, remove_list_unique
+ from datahub.sdk._utils import DEFAULT_ACTOR_URN, add_list_unique, remove_list_unique
  from datahub.sdk.entity import Entity
  from datahub.utilities.urns.error import InvalidUrnError

@@ -54,8 +54,6 @@ DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]

  ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]

- _DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()
-
  TrainingMetricsInputType: TypeAlias = Union[
  List[models.MLMetricClass], Dict[str, Optional[str]]
  ]
@@ -475,7 +473,7 @@ class HasTerms(Entity):
  def _terms_audit_stamp(self) -> models.AuditStampClass:
  return models.AuditStampClass(
  time=0,
- actor=_DEFAULT_ACTOR_URN,
+ actor=DEFAULT_ACTOR_URN,
  )

  def set_terms(self, terms: TermsInputType) -> None:
@@ -563,7 +561,7 @@ class HasInstitutionalMemory(Entity):
  def _institutional_memory_audit_stamp(self) -> models.AuditStampClass:
  return models.AuditStampClass(
  time=0,
- actor=_DEFAULT_ACTOR_URN,
+ actor=DEFAULT_ACTOR_URN,
  )

  @classmethod
datahub/sdk/_utils.py CHANGED
@@ -1,6 +1,10 @@
  from typing import Any, Callable, List, Protocol, TypeVar

  from datahub.errors import ItemNotFoundError
+ from datahub.metadata.urns import CorpUserUrn
+
+ # TODO: Change __ingestion to _ingestion.
+ DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()


  class _SupportsEq(Protocol):
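This refactor moves the default actor constant out of _shared.py and into datahub/sdk/_utils.py so that the existing SDK mixins and the new lineage client can share it. For reference, the expression resolves to the standard ingestion actor URN; a quick check, assuming the package is installed:

    from datahub.metadata.urns import CorpUserUrn

    # Same expression the SDK now uses for DEFAULT_ACTOR_URN.
    print(CorpUserUrn("__ingestion").urn())  # urn:li:corpuser:__ingestion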
datahub/sdk/dataset.py CHANGED
@@ -87,7 +87,7 @@ def _parse_upstream_input(
  assert_never(upstream_input)


- def _parse_cll_mapping(
+ def parse_cll_mapping(
  *,
  upstream: DatasetUrnOrStr,
  downstream: DatasetUrnOrStr,
@@ -142,7 +142,7 @@ def _parse_upstream_lineage_input(
  )
  )
  cll.extend(
- _parse_cll_mapping(
+ parse_cll_mapping(
  upstream=dataset_urn,
  downstream=downstream_urn,
  cll_mapping=column_lineage,
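Renaming `_parse_cll_mapping` to the public `parse_cll_mapping` lets the helper be reused outside dataset.py; the new lineage client below imports it directly. A hedged sketch of a direct call, with hypothetical dataset URNs and column names (the mapping is keyed by downstream column, each value listing its upstream columns, as in the lineage client below):

    from datahub.metadata.urns import DatasetUrn
    from datahub.sdk.dataset import parse_cll_mapping

    upstream = DatasetUrn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_raw,PROD)"
    )
    downstream = DatasetUrn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"
    )

    cll = parse_cll_mapping(
        upstream=upstream,
        downstream=downstream,
        cll_mapping={"order_id": ["id"], "amount": ["amount_usd"]},
    )
    print(len(cll))  # fine-grained lineage entries built from the mapping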
datahub/sdk/lineage_client.py ADDED
@@ -0,0 +1,235 @@
+ from __future__ import annotations
+
+ import difflib
+ import logging
+ from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
+
+ import datahub.metadata.schema_classes as models
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.errors import SdkUsageError
+ from datahub.metadata.schema_classes import SchemaMetadataClass
+ from datahub.metadata.urns import DatasetUrn, QueryUrn
+ from datahub.sdk._shared import DatasetUrnOrStr
+ from datahub.sdk._utils import DEFAULT_ACTOR_URN
+ from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+ from datahub.specific.dataset import DatasetPatchBuilder
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
+ from datahub.utilities.ordered_set import OrderedSet
+
+ if TYPE_CHECKING:
+ from datahub.sdk.main_client import DataHubClient
+
+ logger = logging.getLogger(__name__)
+
+ _empty_audit_stamp = models.AuditStampClass(
+ time=0,
+ actor=DEFAULT_ACTOR_URN,
+ )
+
+
+ class LineageClient:
+ def __init__(self, client: DataHubClient):
+ self._client = client
+
+ def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
+ schema_metadata = self._client._graph.get_aspect(
+ str(dataset_urn), SchemaMetadataClass
+ )
+ if schema_metadata is None:
+ return Set()
+
+ return {field.fieldPath for field in schema_metadata.fields}
+
+ @classmethod
+ def _get_strict_column_lineage(
+ cls,
+ upstream_fields: Set[str],
+ downstream_fields: Set[str],
+ ) -> ColumnLineageMapping:
+ """Find matches between upstream and downstream fields with case-insensitive matching."""
+ strict_column_lineage: ColumnLineageMapping = {}
+
+ # Create case-insensitive mapping of upstream fields
+ case_insensitive_map = {field.lower(): field for field in upstream_fields}
+
+ # Match downstream fields using case-insensitive comparison
+ for downstream_field in downstream_fields:
+ lower_field = downstream_field.lower()
+ if lower_field in case_insensitive_map:
+ # Use the original case of the upstream field
+ strict_column_lineage[downstream_field] = [
+ case_insensitive_map[lower_field]
+ ]
+
+ return strict_column_lineage
+
+ @classmethod
+ def _get_fuzzy_column_lineage(
+ cls,
+ upstream_fields: Set[str],
+ downstream_fields: Set[str],
+ ) -> ColumnLineageMapping:
+ """Generate fuzzy matches between upstream and downstream fields."""
+
+ # Simple normalization function for better matching
+ def normalize(s: str) -> str:
+ return s.lower().replace("_", "")
+
+ # Create normalized lookup for upstream fields
+ normalized_upstream = {normalize(field): field for field in upstream_fields}
+
+ fuzzy_column_lineage = {}
+ for downstream_field in downstream_fields:
+ # Try exact match first
+ if downstream_field in upstream_fields:
+ fuzzy_column_lineage[downstream_field] = [downstream_field]
+ continue
+
+ # Try normalized match
+ norm_downstream = normalize(downstream_field)
+ if norm_downstream in normalized_upstream:
+ fuzzy_column_lineage[downstream_field] = [
+ normalized_upstream[norm_downstream]
+ ]
+ continue
+
+ # If no direct match, find closest match using similarity
+ matches = difflib.get_close_matches(
+ norm_downstream,
+ normalized_upstream.keys(),
+ n=1, # Return only the best match
+ cutoff=0.8, # Adjust cutoff for sensitivity
+ )
+
+ if matches:
+ fuzzy_column_lineage[downstream_field] = [
+ normalized_upstream[matches[0]]
+ ]
+
+ return fuzzy_column_lineage
+
+ def add_dataset_copy_lineage(
+ self,
+ *,
+ upstream: DatasetUrnOrStr,
+ downstream: DatasetUrnOrStr,
+ column_lineage: Union[
+ None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+ ] = "auto_fuzzy",
+ ) -> None:
+ upstream = DatasetUrn.from_string(upstream)
+ downstream = DatasetUrn.from_string(downstream)
+
+ if column_lineage is None:
+ cll = None
+ elif column_lineage in ["auto_fuzzy", "auto_strict"]:
+ upstream_schema = self._get_fields_from_dataset_urn(upstream)
+ downstream_schema = self._get_fields_from_dataset_urn(downstream)
+ if column_lineage == "auto_fuzzy":
+ mapping = self._get_fuzzy_column_lineage(
+ upstream_schema, downstream_schema
+ )
+ else:
+ mapping = self._get_strict_column_lineage(
+ upstream_schema, downstream_schema
+ )
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=mapping,
+ )
+ elif isinstance(column_lineage, dict):
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=column_lineage,
+ )
+
+ updater = DatasetPatchBuilder(str(downstream))
+ updater.add_upstream_lineage(
+ models.UpstreamClass(
+ dataset=str(upstream),
+ type=models.DatasetLineageTypeClass.COPY,
+ )
+ )
+ for cl in cll or []:
+ updater.add_fine_grained_upstream_lineage(cl)
+
+ self._client.entities.update(updater)
+
+ def add_dataset_transform_lineage(
+ self,
+ *,
+ upstream: DatasetUrnOrStr,
+ downstream: DatasetUrnOrStr,
+ column_lineage: Optional[ColumnLineageMapping] = None,
+ query_text: Optional[str] = None,
+ ) -> None:
+ upstream = DatasetUrn.from_string(upstream)
+ downstream = DatasetUrn.from_string(downstream)
+
+ cll = None
+ if column_lineage is not None:
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=column_lineage,
+ )
+
+ fields_involved = OrderedSet([str(upstream), str(downstream)])
+ if cll is not None:
+ for c in cll:
+ for field in c.upstreams or []:
+ fields_involved.add(field)
+ for field in c.downstreams or []:
+ fields_involved.add(field)
+
+ query_urn = None
+ query_entity = None
+ if query_text:
+ # Eventually we might want to use our regex-based fingerprinting instead.
+ fingerprint = generate_hash(query_text)
+ query_urn = QueryUrn(fingerprint).urn()
+
+ from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
+
+ query_entity = MetadataChangeProposalWrapper.construct_many(
+ query_urn,
+ aspects=[
+ models.QueryPropertiesClass(
+ statement=models.QueryStatementClass(
+ value=query_text, language=models.QueryLanguageClass.SQL
+ ),
+ source=models.QuerySourceClass.SYSTEM,
+ created=_empty_audit_stamp,
+ lastModified=_empty_audit_stamp,
+ ),
+ make_query_subjects(list(fields_involved)),
+ ],
+ )
+
+ updater = DatasetPatchBuilder(str(downstream))
+ updater.add_upstream_lineage(
+ models.UpstreamClass(
+ dataset=str(upstream),
+ type=models.DatasetLineageTypeClass.TRANSFORMED,
+ query=query_urn,
+ )
+ )
+ for cl in cll or []:
+ cl.query = query_urn
+ updater.add_fine_grained_upstream_lineage(cl)
+
+ # Throw if the dataset does not exist.
+ # We need to manually call .build() instead of reusing client.update()
+ # so that we make just one emit_mcps call.
+ if not self._client._graph.exists(updater.urn):
+ raise SdkUsageError(
+ f"Dataset {updater.urn} does not exist, and hence cannot be updated."
+ )
+ mcps: List[
+ Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
+ ] = list(updater.build())
+ if query_entity:
+ mcps.extend(query_entity)
+ self._client._graph.emit_mcps(mcps)
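For a concrete feel of the auto-matching above, the two classmethods are pure functions of the upstream and downstream field-name sets, so they can be exercised without a server connection (private helpers, shown for illustration only, assuming acryl-datahub 1.0.0.3rc7 is installed):

    from datahub.sdk.lineage_client import LineageClient

    upstream_fields = {"Order_ID", "customer_id", "total_amount"}
    downstream_fields = {"order_id", "customerid", "total_amt"}

    # Strict: case-insensitive exact matches only.
    print(LineageClient._get_strict_column_lineage(upstream_fields, downstream_fields))
    # {'order_id': ['Order_ID']}

    # Fuzzy: also matches after stripping underscores and via difflib similarity (cutoff 0.8),
    # so customerid -> customer_id and total_amt -> total_amount are picked up as well.
    print(LineageClient._get_fuzzy_column_lineage(upstream_fields, downstream_fields))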
datahub/sdk/main_client.py CHANGED
@@ -6,6 +6,7 @@ from datahub.errors import SdkUsageError
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.ingestion.graph.config import DatahubClientConfig
  from datahub.sdk.entity_client import EntityClient
+ from datahub.sdk.lineage_client import LineageClient
  from datahub.sdk.resolver_client import ResolverClient
  from datahub.sdk.search_client import SearchClient

@@ -99,4 +100,6 @@ class DataHubClient:
  def search(self) -> SearchClient:
  return SearchClient(self)

- # TODO: lineage client
+ @property
+ def lineage(self) -> LineageClient:
+ return LineageClient(self)
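With the `lineage` property in place, the new client is reachable straight from a DataHubClient instance. A hedged usage sketch, assuming the keyword constructor DataHubClient(server=..., token=...); the server URL, token, dataset URNs, and query text are placeholders, and add_dataset_transform_lineage raises SdkUsageError if the downstream dataset does not already exist:

    from datahub.sdk.main_client import DataHubClient

    client = DataHubClient(server="http://localhost:8080", token=None)

    client.lineage.add_dataset_transform_lineage(
        upstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_raw,PROD)",
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)",
        column_lineage={"order_id": ["id"]},
        query_text="SELECT id AS order_id FROM db.schema.orders_raw",
    )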
datahub/sql_parsing/fingerprint_utils.py ADDED
@@ -0,0 +1,6 @@
+ import hashlib
+
+
+ def generate_hash(text: str) -> str:
+ # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
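generate_hash is a plain SHA-256 hex digest of the query text; the lineage client uses it to mint a deterministic query URN of the form urn:li:query:<fingerprint>. An equivalent standalone check using only the standard library:

    import hashlib

    query_text = "SELECT id AS order_id FROM db.schema.orders_raw"
    fingerprint = hashlib.sha256(query_text.encode("utf-8")).hexdigest()

    # Same digest that generate_hash(query_text) would return.
    print(f"urn:li:query:{fingerprint}")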