acryl-datahub 1.0.0.3rc5__py3-none-any.whl → 1.0.0.3rc7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.3rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.3rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=QrGnvYk4Mo48c9uZMV5srbywzBX4Fcxp27GXbDul8F4,323
+ datahub/_version.py,sha256=n49MhTfoZh6lQvQMYp45DAwrexBWePxkeHMtc0_gSGk,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -334,7 +334,7 @@ datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H
  datahub/ingestion/source/hex/model.py,sha256=S9bUhfFcjzuio2dBS6HzSyRVPiSJvRvMQ0qyVrjV5-E,1766
  datahub/ingestion/source/hex/query_fetcher.py,sha256=ZaRrja05mK0VlIvpsFi-8-EBoJr0GSLbUxBUjycibIU,12505
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/iceberg/iceberg.py,sha256=s69XzCGD5oV_hqTyvzCt5eLKZVEzVIJo_DiAEDk3p6A,34759
+ datahub/ingestion/source/iceberg/iceberg.py,sha256=-8uaBerljvonaT7Gn9Evokq6-SSDiMRf8kKo7Hg1qY4,35414
  datahub/ingestion/source/iceberg/iceberg_common.py,sha256=VGosqYPmn_j6GETSnDHZ8Ay1BVOedmx2x5LHxw16I3A,12278
  datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=9iwp2vpQTi4OMbIKoDZV5lAdvjMR0ls6Llpck9grJIE,9875
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -350,7 +350,7 @@ datahub/ingestion/source/kafka_connect/sink_connectors.py,sha256=kEnxOXTik5HSDLj
  datahub/ingestion/source/kafka_connect/source_connectors.py,sha256=OQ0vjz9xF0T30pRln_gDvelmaOE5jTAxwsCtm1K4SWM,21080
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
- datahub/ingestion/source/looker/looker_common.py,sha256=hR4iufcjwh_iBzIJs9tYR4lwSn6vXk2WF5VFAYepKGM,62094
+ datahub/ingestion/source/looker/looker_common.py,sha256=h1DLGyPCYbFM2QUjgJ8ZK_LDLzu-Q7F5X4VIRZMLweg,62817
  datahub/ingestion/source/looker/looker_config.py,sha256=eVKw1nn9D8hUFdRfNyT3MtzL8w-zWhFeokiwSnNKQuc,13607
  datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
  datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
@@ -359,7 +359,7 @@ datahub/ingestion/source/looker/looker_file_loader.py,sha256=gb2Z97_w28MsybYe01J
  datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=2aEorQ3WjBzYVQIm-5QR2qDGAhpKflstwO5X9oboXS8,11553
  datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
  datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
- datahub/ingestion/source/looker/looker_source.py,sha256=rMuq0awCCmKJLbxPDCzl6VlrRe7vgRZ9wJ68b6iCNt4,66383
+ datahub/ingestion/source/looker/looker_source.py,sha256=TOOht_7c5PM77DhR-4hP7cPQZfvSuGU4F_xUeXDPfI8,67803
  datahub/ingestion/source/looker/looker_template_language.py,sha256=5fZFPKFP3IYbJg3jLifjaji4wWg8wRy-1XDvc8Qucus,17949
  datahub/ingestion/source/looker/looker_usage.py,sha256=qFBX7OHtIcarYIqFe0jQMrDV8MMPV_nN4PZrZRUznTw,23029
  datahub/ingestion/source/looker/looker_view_id_cache.py,sha256=92gDy6NONhJYBp92z_IBzDVZvezmUIkaBCZY1bdk6mE,4392
@@ -901,13 +901,14 @@ datahub/pydantic/compat.py,sha256=TUEo4kSEeOWVAhV6LQtst1phrpVgGtK4uif4OI5vQ2M,19
  datahub/sdk/__init__.py,sha256=QeutS6Th8K4E4ZxXuoGrmvahN6zA9Oh9asKk5mw9AIk,1670
  datahub/sdk/_all_entities.py,sha256=inbLFv2T7dhZpGfBY5FPhCWbyE0P0G8umOt0Bc7V4XA,520
  datahub/sdk/_attribution.py,sha256=0Trh8steVd27GOr9MKCZeawbuDD2_q3GIsZlCtHqEUg,1321
- datahub/sdk/_shared.py,sha256=5L1IkihLc7Pd2x0ypDs96kZ8ecm6o0-UZEn0J1Sffqw,24808
- datahub/sdk/_utils.py,sha256=aGE665Su8SGtj2CRDiTaXNYrJ8ADBsS0m4DmaXw79b8,1027
+ datahub/sdk/_shared.py,sha256=64m4Q7D5QdS-Z9Te0uOFwsZMBZEfwZ__2sZ8uCTHa7w,24770
+ datahub/sdk/_utils.py,sha256=oXE2BzsXE5zmSkCP3R1tObD4RHnPeH_ps83D_Dw9JaQ,1169
  datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
- datahub/sdk/dataset.py,sha256=5LG4c_8bHeSPYrW88KNXRgiPD8frBjR0OBVrrwdquU4,29152
+ datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
  datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
- datahub/sdk/main_client.py,sha256=h2MKRhR-BO0zGCMhF7z2bTncX4hagKrAYwR3wTNTtzA,3666
+ datahub/sdk/lineage_client.py,sha256=1JPxQx5pZYQwnzL_VX3KjAbX2QWdvjSKm8S8Bya2IAc,8617
+ datahub/sdk/main_client.py,sha256=EeFZw4ot6L8DP4xHMiy07n7yICTjBn080YtW8ub5mBk,3781
  datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
  datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
  datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
@@ -936,14 +937,15 @@ datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
  datahub/sql_parsing/_sqlglot_patch.py,sha256=dpTiGj4Jxzvbi6HkYiOTbZ81CTFwEEo6JgYOd-dnCMA,6788
  datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+ datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
  datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42XXx5UyTEK9PkekIs,70317
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=zKxIffMWK8YKIN0A7wzOfVkMGC7Cd_9nB6gWc-6BHuw,70671
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
- datahub/sql_parsing/sqlglot_utils.py,sha256=5cUiEWLWfVTI7uIxolAfOfNVo50qnklzhj86gxSFWqg,14943
+ datahub/sql_parsing/sqlglot_utils.py,sha256=TI11oBu1wrGeUuUGBg7hGTr6lTvztahdqiqXNJYRfbQ,14823
  datahub/sql_parsing/tool_meta_extractor.py,sha256=EV_g7sOchTSUm2p6wluNJqND7-rDYokVTqqFCM7hQ6c,7599
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1048,8 +1050,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.3rc5.dist-info/METADATA,sha256=4yRDLa5bvQ1tAjJ2uzc4mKVmegeKOASmLpseMx9A1K8,176989
- acryl_datahub-1.0.0.3rc5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.3rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.3rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.3rc5.dist-info/RECORD,,
+ acryl_datahub-1.0.0.3rc7.dist-info/METADATA,sha256=cS9ynN7khxxthJ9wKCRUKd8-_EHBy-cjhxBPx-KqCu0,176989
+ acryl_datahub-1.0.0.3rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.3rc7.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.3rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.3rc7.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.3rc5"
+ __version__ = "1.0.0.3rc7"


  def is_dev_mode() -> bool:
datahub/ingestion/source/iceberg/iceberg.py CHANGED
@@ -16,7 +16,7 @@ from pyiceberg.exceptions import (
  )
  from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
  from pyiceberg.table import Table
- from pyiceberg.typedef import Identifier
+ from pyiceberg.typedef import Identifier, Properties
  from pyiceberg.types import (
  BinaryType,
  BooleanType,
@@ -387,8 +387,13 @@ class IcebergSource(StatefulIngestionSourceBase):
  env=self.config.env,
  )
  )
+ namespace_properties: Properties = catalog.load_namespace_properties(
+ namespace
+ )
  namespaces.append((namespace, namespace_urn))
- for aspect in self._create_iceberg_namespace_aspects(namespace):
+ for aspect in self._create_iceberg_namespace_aspects(
+ namespace, namespace_properties
+ ):
  yield stamping_processor.stamp_wu(
  MetadataChangeProposalWrapper(
  entityUrn=namespace_urn, aspect=aspect
@@ -608,12 +613,23 @@ class IcebergSource(StatefulIngestionSourceBase):
  return self.report

  def _create_iceberg_namespace_aspects(
- self, namespace: Identifier
+ self, namespace: Identifier, properties: Properties
  ) -> Iterable[_Aspect]:
  namespace_repr = ".".join(namespace)
+ custom_properties: Dict[str, str] = {}
+ for k, v in properties.items():
+ try:
+ custom_properties[str(k)] = str(v)
+ except Exception as e:
+ LOGGER.warning(
+ f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+ )
  yield Status(removed=False)
  yield ContainerProperties(
- name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+ name=namespace_repr,
+ qualifiedName=namespace_repr,
+ env=self.config.env,
+ customProperties=custom_properties,
  )
  yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
  dpi = self._get_dataplatform_instance_aspect()
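The net effect of the iceberg.py change is that properties returned by the catalog's load_namespace_properties() call are now attached to the namespace container as custom properties, with keys and values defensively coerced to strings. A minimal standalone sketch of that coercion step, using a hypothetical namespace_properties dict in place of the pyiceberg Properties object:

    from typing import Dict

    # Hypothetical stand-in for what catalog.load_namespace_properties(namespace) returns.
    namespace_properties = {"owner": "data-platform", "location": "s3://warehouse/analytics.db"}

    custom_properties: Dict[str, str] = {}
    for k, v in namespace_properties.items():
        try:
            # Mirrors the str() coercion in the diff; failures are only logged there.
            custom_properties[str(k)] = str(v)
        except Exception:
            pass

    print(custom_properties)
    # {'owner': 'data-platform', 'location': 's3://warehouse/analytics.db'}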
datahub/ingestion/source/looker/looker_common.py CHANGED
@@ -471,7 +471,10 @@ def get_view_file_path(
  logger.debug("Entered")

  for field in lkml_fields:
- if field.view == view_name:
+ if (
+ LookerUtil.extract_view_name_from_lookml_model_explore_field(field)
+ == view_name
+ ):
  # This path is relative to git clone directory
  logger.debug(f"Found view({view_name}) file-path {field.source_file}")
  return field.source_file
@@ -1103,7 +1106,7 @@ class LookerExplore:
  [column_ref] if column_ref is not None else []
  )

- return cls(
+ looker_explore = cls(
  name=explore_name,
  model_name=model,
  project_name=explore.project_name,
@@ -1121,6 +1124,8 @@ class LookerExplore:
  source_file=explore.source_file,
  tags=list(explore.tags) if explore.tags is not None else [],
  )
+ logger.debug(f"Created LookerExplore from API: {looker_explore}")
+ return looker_explore
  except SDKError as e:
  if "<title>Looker Not Found (404)</title>" in str(e):
  logger.info(
@@ -1161,6 +1166,9 @@ class LookerExplore:
  dataset_name = config.explore_naming_pattern.replace_variables(
  self.get_mapping(config)
  )
+ logger.debug(
+ f"Generated dataset_name={dataset_name} for explore with model_name={self.model_name}, name={self.name}"
+ )

  return builder.make_dataset_urn_with_platform_instance(
  platform=config.platform_name,
@@ -1362,6 +1370,7 @@ class LookerExploreRegistry:

  @lru_cache(maxsize=200)
  def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
+ logger.debug(f"Retrieving explore: model={model}, explore={explore}")
  looker_explore = LookerExplore.from_api(
  model,
  explore,
@@ -1369,6 +1378,12 @@ class LookerExploreRegistry:
  self.report,
  self.source_config,
  )
+ if looker_explore is not None:
+ logger.debug(
+ f"Found explore with model_name={looker_explore.model_name}, name={looker_explore.name}"
+ )
+ else:
+ logger.debug(f"No explore found for model={model}, explore={explore}")
  return looker_explore

  def compute_stats(self) -> Dict:
datahub/ingestion/source/looker/looker_source.py CHANGED
@@ -279,6 +279,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  return []
  result = []

+ if query is not None:
+ logger.debug(
+ f"Processing query: model={query.model}, view={query.view}, input_fields_count={len(query.fields) if query.fields else 0}"
+ )
+
  # query.dynamic_fields can contain:
  # - looker table calculations: https://docs.looker.com/exploring-data/using-table-calculations
  # - looker custom measures: https://docs.looker.com/de/exploring-data/adding-fields/custom-measure
@@ -399,9 +404,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  # Get the explore from the view directly
  explores = [element.query.view] if element.query.view is not None else []
  logger.debug(
- f"Element {element.title}: Explores added via query: {explores}"
+ f"Dashboard element {element.title} (ID: {element.id}): Upstream explores added via query={explores} with model={element.query.model}, explore={element.query.view}"
  )
  for exp in explores:
+ logger.debug(
+ f"Adding reachable explore: model={element.query.model}, explore={exp}, element_id={element.id}, title={element.title}"
+ )
  self.add_reachable_explore(
  model=element.query.model,
  explore=exp,
@@ -477,12 +485,10 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  # Failing the above two approaches, pick out details from result_maker
  elif element.result_maker is not None:
- model: str = ""
  input_fields = []

  explores = []
  if element.result_maker.query is not None:
- model = element.result_maker.query.model
  if element.result_maker.query.view is not None:
  explores.append(element.result_maker.query.view)
  input_fields = self._get_input_fields_from_query(
@@ -502,9 +508,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  # In addition to the query, filters can point to fields as well
  assert element.result_maker.filterables is not None
+
+ # Different dashboard elements my reference explores from different models
+ # so we need to create a mapping of explore names to their models to maintain correct associations
+ explore_to_model_map = {}
+
  for filterable in element.result_maker.filterables:
  if filterable.view is not None and filterable.model is not None:
- model = filterable.model
+ # Store the model for this view/explore in our mapping
+ explore_to_model_map[filterable.view] = filterable.model
  explores.append(filterable.view)
  self.add_reachable_explore(
  model=filterable.model,
@@ -527,6 +539,18 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

  explores = sorted(list(set(explores))) # dedup the list of views

+ logger.debug(
+ f"Dashboard element {element.id} and their explores with the corresponding model: {explore_to_model_map}"
+ )
+
+ # If we have a query, use its model as the default for any explores that don't have a model in our mapping
+ default_model = ""
+ if (
+ element.result_maker.query is not None
+ and element.result_maker.query.model is not None
+ ):
+ default_model = element.result_maker.query.model
+
  return LookerDashboardElement(
  id=element.id,
  title=element.title if element.title is not None else "",
@@ -540,7 +564,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  else ""
  ),
  upstream_explores=[
- LookerExplore(model_name=model, name=exp) for exp in explores
+ LookerExplore(
+ model_name=explore_to_model_map.get(exp, default_model),
+ name=exp,
+ )
+ for exp in explores
  ],
  input_fields=input_fields,
  owner=None,
@@ -1270,6 +1298,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
  chart_urn = self._make_chart_urn(
  element_id=dashboard_element.get_urn_element_id()
  )
+
  input_fields_aspect = InputFieldsClass(
  fields=self._input_fields_from_dashboard_element(dashboard_element)
  )
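To make the behavior change concrete: previously a single `model` string was overwritten as each filterable was processed, so when a dashboard element referenced explores from more than one LookML model, every upstream explore was attributed to whichever model was seen last. The new `explore_to_model_map` keeps each explore paired with its own model, falling back to the query's model only when no mapping entry exists. A small standalone sketch of that logic, with hypothetical view/model pairs standing in for the Looker API filterable objects:

    filterables = [
        ("orders", "ecommerce"),
        ("sessions", "web_analytics"),
    ]
    default_model = "ecommerce"  # would come from result_maker.query.model when present

    explore_to_model_map = {}
    explores = []
    for view, model in filterables:
        explore_to_model_map[view] = model
        explores.append(view)

    # Each explore resolves to its own model instead of the last one seen.
    upstreams = [
        (explore_to_model_map.get(exp, default_model), exp)
        for exp in sorted(set(explores))
    ]
    print(upstreams)  # [('ecommerce', 'orders'), ('web_analytics', 'sessions')]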
datahub/sdk/_shared.py CHANGED
@@ -41,7 +41,7 @@ from datahub.metadata.urns import (
  Urn,
  VersionSetUrn,
  )
- from datahub.sdk._utils import add_list_unique, remove_list_unique
+ from datahub.sdk._utils import DEFAULT_ACTOR_URN, add_list_unique, remove_list_unique
  from datahub.sdk.entity import Entity
  from datahub.utilities.urns.error import InvalidUrnError

@@ -54,8 +54,6 @@ DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]

  ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]

- _DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()
-
  TrainingMetricsInputType: TypeAlias = Union[
  List[models.MLMetricClass], Dict[str, Optional[str]]
  ]
@@ -475,7 +473,7 @@ class HasTerms(Entity):
  def _terms_audit_stamp(self) -> models.AuditStampClass:
  return models.AuditStampClass(
  time=0,
- actor=_DEFAULT_ACTOR_URN,
+ actor=DEFAULT_ACTOR_URN,
  )

  def set_terms(self, terms: TermsInputType) -> None:
@@ -563,7 +561,7 @@ class HasInstitutionalMemory(Entity):
  def _institutional_memory_audit_stamp(self) -> models.AuditStampClass:
  return models.AuditStampClass(
  time=0,
- actor=_DEFAULT_ACTOR_URN,
+ actor=DEFAULT_ACTOR_URN,
  )

  @classmethod
datahub/sdk/_utils.py CHANGED
@@ -1,6 +1,10 @@
  from typing import Any, Callable, List, Protocol, TypeVar

  from datahub.errors import ItemNotFoundError
+ from datahub.metadata.urns import CorpUserUrn
+
+ # TODO: Change __ingestion to _ingestion.
+ DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()


  class _SupportsEq(Protocol):
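This refactor moves the default actor constant out of _shared.py and into datahub/sdk/_utils.py so that the existing SDK mixins and the new lineage client can share it. For reference, the expression resolves to the standard ingestion actor URN; a quick check, assuming the package is installed:

    from datahub.metadata.urns import CorpUserUrn

    # Same expression the SDK now uses for DEFAULT_ACTOR_URN.
    print(CorpUserUrn("__ingestion").urn())  # urn:li:corpuser:__ingestion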
datahub/sdk/dataset.py CHANGED
@@ -87,7 +87,7 @@ def _parse_upstream_input(
  assert_never(upstream_input)


- def _parse_cll_mapping(
+ def parse_cll_mapping(
  *,
  upstream: DatasetUrnOrStr,
  downstream: DatasetUrnOrStr,
@@ -142,7 +142,7 @@ def _parse_upstream_lineage_input(
  )
  )
  cll.extend(
- _parse_cll_mapping(
+ parse_cll_mapping(
  upstream=dataset_urn,
  downstream=downstream_urn,
  cll_mapping=column_lineage,
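Renaming `_parse_cll_mapping` to the public `parse_cll_mapping` lets the helper be reused outside dataset.py; the new lineage client below imports it directly. A hedged sketch of a direct call, with hypothetical dataset URNs and column names (the mapping is keyed by downstream column, each value listing its upstream columns, as in the lineage client below):

    from datahub.metadata.urns import DatasetUrn
    from datahub.sdk.dataset import parse_cll_mapping

    upstream = DatasetUrn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_raw,PROD)"
    )
    downstream = DatasetUrn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"
    )

    cll = parse_cll_mapping(
        upstream=upstream,
        downstream=downstream,
        cll_mapping={"order_id": ["id"], "amount": ["amount_usd"]},
    )
    print(len(cll))  # fine-grained lineage entries built from the mapping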
datahub/sdk/lineage_client.py ADDED
@@ -0,0 +1,235 @@
+ from __future__ import annotations
+
+ import difflib
+ import logging
+ from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
+
+ import datahub.metadata.schema_classes as models
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.errors import SdkUsageError
+ from datahub.metadata.schema_classes import SchemaMetadataClass
+ from datahub.metadata.urns import DatasetUrn, QueryUrn
+ from datahub.sdk._shared import DatasetUrnOrStr
+ from datahub.sdk._utils import DEFAULT_ACTOR_URN
+ from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+ from datahub.specific.dataset import DatasetPatchBuilder
+ from datahub.sql_parsing.fingerprint_utils import generate_hash
+ from datahub.utilities.ordered_set import OrderedSet
+
+ if TYPE_CHECKING:
+ from datahub.sdk.main_client import DataHubClient
+
+ logger = logging.getLogger(__name__)
+
+ _empty_audit_stamp = models.AuditStampClass(
+ time=0,
+ actor=DEFAULT_ACTOR_URN,
+ )
+
+
+ class LineageClient:
+ def __init__(self, client: DataHubClient):
+ self._client = client
+
+ def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
+ schema_metadata = self._client._graph.get_aspect(
+ str(dataset_urn), SchemaMetadataClass
+ )
+ if schema_metadata is None:
+ return Set()
+
+ return {field.fieldPath for field in schema_metadata.fields}
+
+ @classmethod
+ def _get_strict_column_lineage(
+ cls,
+ upstream_fields: Set[str],
+ downstream_fields: Set[str],
+ ) -> ColumnLineageMapping:
+ """Find matches between upstream and downstream fields with case-insensitive matching."""
+ strict_column_lineage: ColumnLineageMapping = {}
+
+ # Create case-insensitive mapping of upstream fields
+ case_insensitive_map = {field.lower(): field for field in upstream_fields}
+
+ # Match downstream fields using case-insensitive comparison
+ for downstream_field in downstream_fields:
+ lower_field = downstream_field.lower()
+ if lower_field in case_insensitive_map:
+ # Use the original case of the upstream field
+ strict_column_lineage[downstream_field] = [
+ case_insensitive_map[lower_field]
+ ]
+
+ return strict_column_lineage
+
+ @classmethod
+ def _get_fuzzy_column_lineage(
+ cls,
+ upstream_fields: Set[str],
+ downstream_fields: Set[str],
+ ) -> ColumnLineageMapping:
+ """Generate fuzzy matches between upstream and downstream fields."""
+
+ # Simple normalization function for better matching
+ def normalize(s: str) -> str:
+ return s.lower().replace("_", "")
+
+ # Create normalized lookup for upstream fields
+ normalized_upstream = {normalize(field): field for field in upstream_fields}
+
+ fuzzy_column_lineage = {}
+ for downstream_field in downstream_fields:
+ # Try exact match first
+ if downstream_field in upstream_fields:
+ fuzzy_column_lineage[downstream_field] = [downstream_field]
+ continue
+
+ # Try normalized match
+ norm_downstream = normalize(downstream_field)
+ if norm_downstream in normalized_upstream:
+ fuzzy_column_lineage[downstream_field] = [
+ normalized_upstream[norm_downstream]
+ ]
+ continue
+
+ # If no direct match, find closest match using similarity
+ matches = difflib.get_close_matches(
+ norm_downstream,
+ normalized_upstream.keys(),
+ n=1, # Return only the best match
+ cutoff=0.8, # Adjust cutoff for sensitivity
+ )
+
+ if matches:
+ fuzzy_column_lineage[downstream_field] = [
+ normalized_upstream[matches[0]]
+ ]
+
+ return fuzzy_column_lineage
+
+ def add_dataset_copy_lineage(
+ self,
+ *,
+ upstream: DatasetUrnOrStr,
+ downstream: DatasetUrnOrStr,
+ column_lineage: Union[
+ None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
+ ] = "auto_fuzzy",
+ ) -> None:
+ upstream = DatasetUrn.from_string(upstream)
+ downstream = DatasetUrn.from_string(downstream)
+
+ if column_lineage is None:
+ cll = None
+ elif column_lineage in ["auto_fuzzy", "auto_strict"]:
+ upstream_schema = self._get_fields_from_dataset_urn(upstream)
+ downstream_schema = self._get_fields_from_dataset_urn(downstream)
+ if column_lineage == "auto_fuzzy":
+ mapping = self._get_fuzzy_column_lineage(
+ upstream_schema, downstream_schema
+ )
+ else:
+ mapping = self._get_strict_column_lineage(
+ upstream_schema, downstream_schema
+ )
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=mapping,
+ )
+ elif isinstance(column_lineage, dict):
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=column_lineage,
+ )
+
+ updater = DatasetPatchBuilder(str(downstream))
+ updater.add_upstream_lineage(
+ models.UpstreamClass(
+ dataset=str(upstream),
+ type=models.DatasetLineageTypeClass.COPY,
+ )
+ )
+ for cl in cll or []:
+ updater.add_fine_grained_upstream_lineage(cl)
+
+ self._client.entities.update(updater)
+
+ def add_dataset_transform_lineage(
+ self,
+ *,
+ upstream: DatasetUrnOrStr,
+ downstream: DatasetUrnOrStr,
+ column_lineage: Optional[ColumnLineageMapping] = None,
+ query_text: Optional[str] = None,
+ ) -> None:
+ upstream = DatasetUrn.from_string(upstream)
+ downstream = DatasetUrn.from_string(downstream)
+
+ cll = None
+ if column_lineage is not None:
+ cll = parse_cll_mapping(
+ upstream=upstream,
+ downstream=downstream,
+ cll_mapping=column_lineage,
+ )
+
+ fields_involved = OrderedSet([str(upstream), str(downstream)])
+ if cll is not None:
+ for c in cll:
+ for field in c.upstreams or []:
+ fields_involved.add(field)
+ for field in c.downstreams or []:
+ fields_involved.add(field)
+
+ query_urn = None
+ query_entity = None
+ if query_text:
+ # Eventually we might want to use our regex-based fingerprinting instead.
+ fingerprint = generate_hash(query_text)
+ query_urn = QueryUrn(fingerprint).urn()
+
+ from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects
+
+ query_entity = MetadataChangeProposalWrapper.construct_many(
+ query_urn,
+ aspects=[
+ models.QueryPropertiesClass(
+ statement=models.QueryStatementClass(
+ value=query_text, language=models.QueryLanguageClass.SQL
+ ),
+ source=models.QuerySourceClass.SYSTEM,
+ created=_empty_audit_stamp,
+ lastModified=_empty_audit_stamp,
+ ),
+ make_query_subjects(list(fields_involved)),
+ ],
+ )
+
+ updater = DatasetPatchBuilder(str(downstream))
+ updater.add_upstream_lineage(
+ models.UpstreamClass(
+ dataset=str(upstream),
+ type=models.DatasetLineageTypeClass.TRANSFORMED,
+ query=query_urn,
+ )
+ )
+ for cl in cll or []:
+ cl.query = query_urn
+ updater.add_fine_grained_upstream_lineage(cl)
+
+ # Throw if the dataset does not exist.
+ # We need to manually call .build() instead of reusing client.update()
+ # so that we make just one emit_mcps call.
+ if not self._client._graph.exists(updater.urn):
+ raise SdkUsageError(
+ f"Dataset {updater.urn} does not exist, and hence cannot be updated."
+ )
+ mcps: List[
+ Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
+ ] = list(updater.build())
+ if query_entity:
+ mcps.extend(query_entity)
+ self._client._graph.emit_mcps(mcps)
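For a concrete feel of the auto-matching above, the two classmethods are pure functions of the upstream and downstream field-name sets, so they can be exercised without a server connection (private helpers, shown for illustration only, assuming acryl-datahub 1.0.0.3rc7 is installed):

    from datahub.sdk.lineage_client import LineageClient

    upstream_fields = {"Order_ID", "customer_id", "total_amount"}
    downstream_fields = {"order_id", "customerid", "total_amt"}

    # Strict: case-insensitive exact matches only.
    print(LineageClient._get_strict_column_lineage(upstream_fields, downstream_fields))
    # {'order_id': ['Order_ID']}

    # Fuzzy: also matches after stripping underscores and via difflib similarity (cutoff 0.8),
    # so customerid -> customer_id and total_amt -> total_amount are picked up as well.
    print(LineageClient._get_fuzzy_column_lineage(upstream_fields, downstream_fields))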
datahub/sdk/main_client.py CHANGED
@@ -6,6 +6,7 @@ from datahub.errors import SdkUsageError
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.ingestion.graph.config import DatahubClientConfig
  from datahub.sdk.entity_client import EntityClient
+ from datahub.sdk.lineage_client import LineageClient
  from datahub.sdk.resolver_client import ResolverClient
  from datahub.sdk.search_client import SearchClient

@@ -99,4 +100,6 @@ class DataHubClient:
  def search(self) -> SearchClient:
  return SearchClient(self)

- # TODO: lineage client
+ @property
+ def lineage(self) -> LineageClient:
+ return LineageClient(self)
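With the `lineage` property in place, the new client is reachable straight from a DataHubClient instance. A hedged usage sketch, assuming the keyword constructor DataHubClient(server=..., token=...); the server URL, token, dataset URNs, and query text are placeholders, and add_dataset_transform_lineage raises SdkUsageError if the downstream dataset does not already exist:

    from datahub.sdk.main_client import DataHubClient

    client = DataHubClient(server="http://localhost:8080", token=None)

    client.lineage.add_dataset_transform_lineage(
        upstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders_raw,PROD)",
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)",
        column_lineage={"order_id": ["id"]},
        query_text="SELECT id AS order_id FROM db.schema.orders_raw",
    )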
datahub/sql_parsing/fingerprint_utils.py ADDED
@@ -0,0 +1,6 @@
+ import hashlib
+
+
+ def generate_hash(text: str) -> str:
+ # Once we move to Python 3.9+, we can set `usedforsecurity=False`.
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
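generate_hash is a plain SHA-256 hex digest of the query text; the lineage client uses it to mint a deterministic query URN of the form urn:li:query:<fingerprint>. An equivalent standalone check using only the standard library:

    import hashlib

    query_text = "SELECT id AS order_id FROM db.schema.orders_raw"
    fingerprint = hashlib.sha256(query_text.encode("utf-8")).hexdigest()

    # Same digest that generate_hash(query_text) would return.
    print(f"urn:li:query:{fingerprint}")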