acryl-datahub 0.15.0rc20__py3-none-any.whl → 0.15.0rc21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=fYgu28dsndrekGv9Pq_ENw7G6Erm7qtsY5H6W3cKFDU,575
+ datahub/__init__.py,sha256=caUPlyD6P05EsMKzRYtlTS611d82sT4szr8_WAu_rJ4,575
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
  datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
  datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/api/entities/structuredproperties/structuredproperties.py,sha256=PcTX5gI7pg_Aq9JeIvUNZ5JYrQ2XS1uUEJZ73ORgYgA,9434
+ datahub/api/entities/structuredproperties/structuredproperties.py,sha256=YO4mdn6BziOzvzoFe-g2KfZlOZy8gqwMyyzj_7vF4BY,8845
  datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
  datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
  datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
  datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
  datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
- datahub/emitter/rest_emitter.py,sha256=rIWqEJjcSIM16_8DXqNqZ_h5s_nj46DTiyRKA5EQHXQ,15021
+ datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -180,7 +180,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
  datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
  datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
  datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
- datahub/ingestion/sink/datahub_rest.py,sha256=pU9z-vR-R7kGogqxkC7-9AZNctR9oUfAmfhhoD0-hwQ,12245
+ datahub/ingestion/sink/datahub_rest.py,sha256=ME8OygJgd7AowrokJLmdjYHxIQEy5jXWS0yKwOLR934,12592
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -321,7 +321,7 @@ datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
  datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
- datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
+ datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
  datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
@@ -427,13 +427,13 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
  datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=w2CPm5XEU-KMUSIpb58aKOaxTDHfM5NvghutCVRicy4,23247
+ datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=EnTJoRIQKcZOIYfb_NUff_YA8IdIroaFD1JHUn-M6ok,23346
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
  datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=PuqoseJbqkQEIYkmlLvPJxcVOGG7HVs4U-WWFQgQEWs,38211
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
- datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=K31vJ19ZCIqtJkszsJWF1eppu8U23gkZYfb5jw231dc,20997
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=st4qoOdMGuo6fJQh-cJf_2hnczIuv6VRXGO4x3p1MgQ,39416
+ datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
+ datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
@@ -559,12 +559,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_schema_classes.py,sha256=iPeBXGvbNEm0vw5pYwunnvx7bTtBdmIQVtzMOlS6bSI,955042
- datahub/metadata/schema.avsc,sha256=Xx93OdPzQfBb2CtntIYE-HAeKNg-JZcCtRU95v7ZZCs,677728
+ datahub/metadata/_schema_classes.py,sha256=FTLom36n7gr6zxYfPWWoy9AmdnB4KOIXYRoVZbS9kog,955042
+ datahub/metadata/schema.avsc,sha256=D-rNu2SC2tyvqju8pQwGNGGT9zy1_fzxzoigH5YmUvo,722242
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
- datahub/metadata/_urns/urn_defs.py,sha256=WBHf7Ze2qBvR-uWpcdMqEy-T2AIBzf8ioS-wJMMXXOo,107119
+ datahub/metadata/_urns/urn_defs.py,sha256=LFHZGzHlDA0KJes1Xg7-lWetXusi7bubA7Q5hu4ER88,107119
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0rc20.dist-info/METADATA,sha256=KuTZA5lnEW-UAvSPqqkBsDFKkwlJF8WzYbcphVMW_aE,173559
- acryl_datahub-0.15.0rc20.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0rc20.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
- acryl_datahub-0.15.0rc20.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0rc20.dist-info/RECORD,,
+ acryl_datahub-0.15.0rc21.dist-info/METADATA,sha256=e3Tw7Cix7Z1uR8zyUtppjUv0ztJa2Kga0yl7nwPMbF8,173559
+ acryl_datahub-0.15.0rc21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0rc21.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+ acryl_datahub-0.15.0rc21.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0rc21.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
  
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0rc20"
+ __version__ = "0.15.0rc21"
  
  
  def is_dev_mode() -> bool:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,8 +1,7 @@
  import logging
- from contextlib import contextmanager
  from enum import Enum
  from pathlib import Path
- from typing import Generator, List, Optional
+ from typing import List, Optional
  
  import yaml
  from pydantic import validator
@@ -10,6 +9,7 @@ from ruamel.yaml import YAML
  
  from datahub.configuration.common import ConfigModel
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.schema_classes import (
      PropertyValueClass,
@@ -24,23 +24,10 @@ logger = logging.getLogger(__name__)
  class StructuredPropertiesConfig:
      """Configuration class to hold the graph client"""
  
-     _graph: Optional[DataHubGraph] = None
- 
-     @classmethod
-     @contextmanager
-     def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
-         """Context manager to temporarily set a custom graph"""
-         previous_graph = cls._graph
-         cls._graph = graph
-         try:
-             yield
-         finally:
-             cls._graph = previous_graph
- 
      @classmethod
-     def get_graph(cls) -> DataHubGraph:
+     def get_graph_required(cls) -> DataHubGraph:
          """Get the current graph, falling back to default if none set"""
-         return cls._graph if cls._graph is not None else get_default_graph()
+         return get_graph_context() or get_default_graph()
  
  
  class AllowedTypes(Enum):
@@ -79,7 +66,7 @@ class TypeQualifierAllowedTypes(ConfigModel):
      @validator("allowed_types", each_item=True)
      def validate_allowed_types(cls, v):
          if v:
-             graph = StructuredPropertiesConfig.get_graph()
+             graph = StructuredPropertiesConfig.get_graph_required()
              validated_urn = Urn.make_entity_type_urn(v)
              if not graph.exists(validated_urn):
                  raise ValueError(
@@ -106,7 +93,7 @@ class StructuredProperties(ConfigModel):
      @validator("entity_types", each_item=True)
      def validate_entity_types(cls, v):
          if v:
-             graph = StructuredPropertiesConfig.get_graph()
+             graph = StructuredPropertiesConfig.get_graph_required()
              validated_urn = Urn.make_entity_type_urn(v)
              if not graph.exists(validated_urn):
                  raise ValueError(
@@ -136,63 +123,64 @@ class StructuredProperties(ConfigModel):
  
      @staticmethod
      def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
-         emitter: DataHubGraph = graph if graph else get_default_graph()
-         with StructuredPropertiesConfig.use_graph(emitter):
-             print("Using graph")
+         with set_graph_context(graph):
+             graph = StructuredPropertiesConfig.get_graph_required()
+ 
              with open(file) as fp:
                  structuredproperties: List[dict] = yaml.safe_load(fp)
-                 for structuredproperty_raw in structuredproperties:
-                     structuredproperty = StructuredProperties.parse_obj(
-                         structuredproperty_raw
+             for structuredproperty_raw in structuredproperties:
+                 structuredproperty = StructuredProperties.parse_obj(
+                     structuredproperty_raw
+                 )
+ 
+                 if not structuredproperty.type.islower():
+                     structuredproperty.type = structuredproperty.type.lower()
+                     logger.warning(
+                         f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
                      )
-                     if not structuredproperty.type.islower():
-                         structuredproperty.type = structuredproperty.type.lower()
-                         logger.warn(
-                             f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
-                         )
-                     if not AllowedTypes.check_allowed_type(structuredproperty.type):
-                         raise ValueError(
-                             f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
-                         )
-                     mcp = MetadataChangeProposalWrapper(
-                         entityUrn=structuredproperty.urn,
-                         aspect=StructuredPropertyDefinitionClass(
-                             qualifiedName=structuredproperty.fqn,
-                             valueType=Urn.make_data_type_urn(structuredproperty.type),
-                             displayName=structuredproperty.display_name,
-                             description=structuredproperty.description,
-                             entityTypes=[
-                                 Urn.make_entity_type_urn(entity_type)
-                                 for entity_type in structuredproperty.entity_types or []
-                             ],
-                             cardinality=structuredproperty.cardinality,
-                             immutable=structuredproperty.immutable,
-                             allowedValues=(
-                                 [
-                                     PropertyValueClass(
-                                         value=v.value, description=v.description
-                                     )
-                                     for v in structuredproperty.allowed_values
-                                 ]
-                                 if structuredproperty.allowed_values
-                                 else None
-                             ),
-                             typeQualifier=(
-                                 {
-                                     "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                                 }
-                                 if structuredproperty.type_qualifier
-                                 else None
-                             ),
-                         ),
+                 if not AllowedTypes.check_allowed_type(structuredproperty.type):
+                     raise ValueError(
+                         f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
                      )
-                     emitter.emit_mcp(mcp)
+                 mcp = MetadataChangeProposalWrapper(
+                     entityUrn=structuredproperty.urn,
+                     aspect=StructuredPropertyDefinitionClass(
+                         qualifiedName=structuredproperty.fqn,
+                         valueType=Urn.make_data_type_urn(structuredproperty.type),
+                         displayName=structuredproperty.display_name,
+                         description=structuredproperty.description,
+                         entityTypes=[
+                             Urn.make_entity_type_urn(entity_type)
+                             for entity_type in structuredproperty.entity_types or []
+                         ],
+                         cardinality=structuredproperty.cardinality,
+                         immutable=structuredproperty.immutable,
+                         allowedValues=(
+                             [
+                                 PropertyValueClass(
+                                     value=v.value, description=v.description
+                                 )
+                                 for v in structuredproperty.allowed_values
+                             ]
+                             if structuredproperty.allowed_values
+                             else None
+                         ),
+                         typeQualifier=(
+                             {
+                                 "allowedTypes": structuredproperty.type_qualifier.allowed_types
+                             }
+                             if structuredproperty.type_qualifier
+                             else None
+                         ),
+                     ),
+                 )
+                 graph.emit_mcp(mcp)
  
-                     logger.info(f"Created structured property {structuredproperty.urn}")
+                 logger.info(f"Created structured property {structuredproperty.urn}")
  
      @classmethod
      def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-         with StructuredPropertiesConfig.use_graph(graph):
+         with set_graph_context(graph):
              structured_property: Optional[
                  StructuredPropertyDefinitionClass
              ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
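The class-level `use_graph` context manager is gone; `StructuredPropertiesConfig` now reads the active graph from the new `datahub.ingestion.api.global_context` helpers, whose implementation is not included in this diff. A minimal sketch of how such helpers could be built on `contextvars` (illustrative only, not the shipped module):

```python
# Illustrative sketch only -- the real datahub.ingestion.api.global_context
# module is not part of this diff and may differ.
import contextlib
from contextvars import ContextVar
from typing import Iterator, Optional

from datahub.ingestion.graph.client import DataHubGraph

# Holds the graph client for the current context, if one has been set.
_graph_context: ContextVar[Optional[DataHubGraph]] = ContextVar(
    "graph_context", default=None
)


def get_graph_context() -> Optional[DataHubGraph]:
    """Return the graph set by the innermost set_graph_context(), or None."""
    return _graph_context.get()


@contextlib.contextmanager
def set_graph_context(graph: Optional[DataHubGraph]) -> Iterator[None]:
    """Temporarily set the current graph; restore the previous value on exit."""
    token = _graph_context.set(graph)
    try:
        yield
    finally:
        _graph_context.reset(token)
```

With helpers shaped like this, `get_graph_required()` can fall back to `get_default_graph()` whenever no graph is bound, which is how the rewritten `create()` and `from_datahub()` paths use it.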
datahub/emitter/rest_emitter.py CHANGED
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
      os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
  )
  
- # The limit is 16mb. We will use a max of 15mb to have some space for overhead.
- _MAX_BATCH_INGEST_PAYLOAD_SIZE = 15 * 1024 * 1024
+ # The limit is 16mb. We will use a max of 15mb to have some space
+ # for overhead like request headers.
+ # This applies to pretty much all calls to GMS.
+ INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+ 
+ # This limit is somewhat arbitrary. All GMS endpoints will timeout
+ # and return a 500 if processing takes too long. To avoid sending
+ # too much to the backend and hitting a timeout, we try to limit
+ # the number of MCPs we send in a batch.
+ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+     os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+ )
  
  
  class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
          # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
          # If we will exceed the limit, we need to break it up into chunks.
          mcp_obj_chunks: List[List[str]] = []
-         current_chunk_size = _MAX_BATCH_INGEST_PAYLOAD_SIZE
+         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
          for mcp_obj in mcp_objs:
              mcp_obj_size = len(json.dumps(mcp_obj))
  
-             if mcp_obj_size + current_chunk_size > _MAX_BATCH_INGEST_PAYLOAD_SIZE:
+             if (
+                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+             ):
                  mcp_obj_chunks.append([])
                  current_chunk_size = 0
              mcp_obj_chunks[-1].append(mcp_obj)
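The chunking loop above now splits batches on two limits instead of one: total serialized size (`INGEST_MAX_PAYLOAD_BYTES`) and item count (`BATCH_INGEST_MAX_PAYLOAD_LENGTH`). A standalone sketch of the same policy, with the constants copied from this diff and a hypothetical helper name:

```python
import json
from typing import Any, Dict, List

# Defaults mirroring this diff: ~15 MiB per request and at most 200 MCPs per
# batch (overridable via DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH).
INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
BATCH_INGEST_MAX_PAYLOAD_LENGTH = 200


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Split serialized MCPs into chunks bounded by both byte size and count."""
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_chunk_size = 0
    for mcp_obj in mcp_objs:
        mcp_obj_size = len(json.dumps(mcp_obj))
        if (
            current_chunk_size + mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES
            or len(chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
        ):
            # Start a new chunk once either limit would be exceeded.
            chunks.append([])
            current_chunk_size = 0
        chunks[-1].append(mcp_obj)
        current_chunk_size += mcp_obj_size
    return chunks
```

Capping the count as well as the size keeps each request under both the ~16 MB GMS payload limit and the endpoint timeout described in the new comments.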
datahub/ingestion/sink/datahub_rest.py CHANGED
@@ -18,7 +18,10 @@ from datahub.configuration.common import (
  )
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.mcp_builder import mcps_from_mce
- from datahub.emitter.rest_emitter import DataHubRestEmitter
+ from datahub.emitter.rest_emitter import (
+     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+     DataHubRestEmitter,
+ )
  from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
  from datahub.ingestion.api.sink import (
      NoopWriteCallback,
@@ -71,6 +74,14 @@ class DatahubRestSinkConfig(DatahubClientConfig):
      # Only applies in async batch mode.
      max_per_batch: pydantic.PositiveInt = 100
  
+     @pydantic.validator("max_per_batch", always=True)
+     def validate_max_per_batch(cls, v):
+         if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+             raise ValueError(
+                 f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+             )
+         return v
+ 
  
  @dataclasses.dataclass
  class DataHubRestSinkReport(SinkReport):
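The sink's `max_per_batch` option is now validated against the emitter-level `BATCH_INGEST_MAX_PAYLOAD_LENGTH` cap at config-parse time. A hedged usage sketch, assuming the config can be constructed directly with just a `server` URL (pydantic's `ValidationError` is a `ValueError` subclass):

```python
from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

# Within the cap (200 by default): accepted.
ok = DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=150)

# Above the cap: rejected by the new validator before the sink is built.
try:
    DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=5000)
except ValueError as e:
    print(f"rejected: {e}")
```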
datahub/ingestion/source/kafka/kafka_connect.py CHANGED
@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
          query: str
          transforms: list
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_parser(
          self,
          connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                      source_table = f"{table_name_tuple[-2]}.{source_table}"
              else:
                  include_source_dataset = False
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find schema for table {source_table}",
+                 self.report.warning(
+                     "Could not find schema for table"
+                     f"{self.connector_manifest.name} : {source_table}",
                  )
              dataset_name: str = get_dataset_name(database_name, source_table)
              lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                      target_platform=KAFKA,
                  )
                  lineages.append(lineage)
-             self.report_warning(
+             self.report.warning(
+                 "Could not find input dataset, the connector has query configuration set",
                  self.connector_manifest.name,
-                 "could not find input dataset, the connector has query configuration set",
              )
              self.connector_manifest.lineages = lineages
              return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                          include_source_dataset=False,
                      )
                  )
-             self.report_warning(
-                 self.connector_manifest.name,
-                 f"could not find input dataset, for connector topics {topic_names}",
+             self.report.warning(
+                 "Could not find input dataset for connector topics",
+                 f"{self.connector_manifest.name} : {topic_names}",
              )
              self.connector_manifest.lineages = lineages
              return
          else:
              include_source_dataset = True
              if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}",
+                 self.report.warning(
+                     "Could not find input dataset, connector has unknown transform",
+                     f"{self.connector_manifest.name} : {transforms[0]['type']}",
                  )
                  include_source_dataset = False
              if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                 self.report_warning(
+                 self.report.warning(
+                     "Could not find input dataset, connector has one or more unknown transforms",
                      self.connector_manifest.name,
-                     "could not find input dataset, connector has one or more unknown transforms",
                  )
                  include_source_dataset = False
              lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                  lineages.append(lineage)
              self.connector_manifest.lineages = lineages
          except Exception as e:
-             self.report.report_warning(
-                 self.connector_manifest.name, f"Error resolving lineage: {e}"
+             self.report.warning(
+                 "Error resolving lineage for connector",
+                 self.connector_manifest.name,
+                 exc=e,
              )
  
          return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
          defaultDataset: Optional[str] = None
          version: str = "v1"
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_parser(
          self,
          connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
              transformed_topic = self.apply_transformations(topic, transforms)
              dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
              if dataset_table is None:
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find target dataset for topic {transformed_topic}, please check your connector configuration",
+                 self.report.warning(
+                     "Could not find target dataset for topic, please check your connector configuration"
+                     f"{self.connector_manifest.name} : {transformed_topic} ",
                  )
                  continue
              target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
          schema_name: str
          topics_to_tables: Dict[str, str]
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_table_name_from_topic_name(self, topic_name: str) -> str:
          """
          This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
              )
              self.connector_manifest.lineages = lineages
          except Exception as e:
-             self.report.report_warning(
-                 self.connector_manifest.name, f"Error resolving lineage: {e}"
+             self.report.warning(
+                 "Error resolving lineage for connector",
+                 self.connector_manifest.name,
+                 exc=e,
              )
  
          return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
              )
              self.session.auth = (self.config.username, self.config.password)
  
-         test_response = self.session.get(f"{self.config.connect_uri}")
+         test_response = self.session.get(f"{self.config.connect_uri}/connectors")
          test_response.raise_for_status()
          logger.info(f"Connection to {self.config.connect_uri} is ok")
          if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
  
          payload = connector_response.json()
  
-         for c in payload:
-             connector_url = f"{self.config.connect_uri}/connectors/{c}"
-             connector_response = self.session.get(connector_url)
-             manifest = connector_response.json()
-             connector_manifest = ConnectorManifest(**manifest)
-             if not self.config.connector_patterns.allowed(connector_manifest.name):
-                 self.report.report_dropped(connector_manifest.name)
+         for connector_name in payload:
+             connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+             connector_manifest = self._get_connector_manifest(
+                 connector_name, connector_url
+             )
+             if (
+                 connector_manifest is None
+                 or not self.config.connector_patterns.allowed(connector_manifest.name)
+             ):
+                 self.report.report_dropped(connector_name)
                  continue
  
              if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
              connector_manifest.lineages = list()
              connector_manifest.url = connector_url
  
-             topics = self.session.get(
-                 f"{self.config.connect_uri}/connectors/{c}/topics",
-             ).json()
- 
-             connector_manifest.topic_names = topics[c]["topics"]
+             connector_manifest.topic_names = self._get_connector_topics(connector_name)
  
              # Populate Source Connector metadata
              if connector_manifest.type == SOURCE:
-                 tasks = self.session.get(
-                     f"{self.config.connect_uri}/connectors/{c}/tasks",
-                 ).json()
- 
-                 connector_manifest.tasks = tasks
+                 connector_manifest.tasks = self._get_connector_tasks(connector_name)
  
                  # JDBC source connector lineages
                  if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                  )
                  continue
  
-             for topic in topics:
+             for topic in connector_manifest.topic_names:
                  lineage = KafkaConnectLineage(
                      source_dataset=target_connector.source_dataset,
                      source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
  
          return connectors_manifest
  
+     def _get_connector_manifest(
+         self, connector_name: str, connector_url: str
+     ) -> Optional[ConnectorManifest]:
+         try:
+             connector_response = self.session.get(connector_url)
+             connector_response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Failed to get connector details", connector_name, exc=e
+             )
+             return None
+         manifest = connector_response.json()
+         connector_manifest = ConnectorManifest(**manifest)
+         return connector_manifest
+ 
+     def _get_connector_tasks(self, connector_name: str) -> dict:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector tasks", context=connector_name, exc=e
+             )
+             return {}
+ 
+         return response.json()
+ 
+     def _get_connector_topics(self, connector_name: str) -> List[str]:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector topics", context=connector_name, exc=e
+             )
+             return []
+ 
+         return response.json()[connector_name]["topics"]
+ 
      def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
          connector_name = connector.name
          connector_type = connector.type
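The connector discovery loop now delegates the per-connector REST calls to `_get_connector_manifest`, `_get_connector_tasks`, and `_get_connector_topics`, each of which converts an HTTP failure into a structured warning plus an empty result instead of aborting the whole run. A standalone sketch of that pattern with plain `requests` (function and parameter names here are illustrative, not part of the source):

```python
import logging
from typing import List

import requests

logger = logging.getLogger(__name__)


def get_connector_topics(
    session: requests.Session, connect_uri: str, connector_name: str
) -> List[str]:
    """Fetch a connector's topic list, degrading to [] if the REST call fails."""
    try:
        response = session.get(f"{connect_uri}/connectors/{connector_name}/topics")
        response.raise_for_status()
    except Exception:
        # The source records a structured report warning here; a log line
        # stands in for that in this standalone sketch.
        logger.warning("Error getting connector topics: %s", connector_name, exc_info=True)
        return []
    return response.json()[connector_name]["topics"]
```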
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -413,9 +413,10 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
              return UpstreamLineageEdge.parse_obj(db_row)
          except Exception as e:
              self.report.num_upstream_lineage_edge_parsing_failed += 1
+             upstream_tables = db_row.get("UPSTREAM_TABLES")
              self.structured_reporter.warning(
                  "Failed to parse lineage edge",
-                 context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+                 context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
                  exc=e,
              )
              return None
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -237,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
  LIMIT {limit} {from_clause};
  """
  
+     @staticmethod
+     def get_secure_view_definitions() -> str:
+         # https://docs.snowflake.com/en/sql-reference/account-usage/views
+         return """
+         SELECT
+             TABLE_CATALOG as "TABLE_CATALOG",
+             TABLE_SCHEMA as "TABLE_SCHEMA",
+             TABLE_NAME as "TABLE_NAME",
+             VIEW_DEFINITION as "VIEW_DEFINITION"
+         FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+         WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+         """
+ 
      @staticmethod
      def columns_for_schema(
          schema_name: str,
datahub/ingestion/source/snowflake/snowflake_schema.py CHANGED
@@ -266,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
              snowflake_schemas.append(snowflake_schema)
          return snowflake_schemas
  
+     @serialized_lru_cache(maxsize=1)
+     def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+         secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+             lambda: defaultdict(lambda: defaultdict())
+         )
+         cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+         for view in cur:
+             db_name = view["TABLE_CATALOG"]
+             schema_name = view["TABLE_SCHEMA"]
+             view_name = view["TABLE_NAME"]
+             secure_view_definitions[db_name][schema_name][view_name] = view[
+                 "VIEW_DEFINITION"
+             ]
+ 
+         return secure_view_definitions
+ 
      @serialized_lru_cache(maxsize=1)
      def get_tables_for_database(
          self, db_name: str
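`get_secure_view_definitions()` returns a triply nested mapping keyed by database, schema, and view name, so a caller can fetch a secure view's DDL with plain dictionary lookups. A minimal consumption sketch with illustrative names (the actual call sites are not shown in this diff):

```python
from typing import Dict, Optional


def lookup_secure_view_definition(
    definitions: Dict[str, Dict[str, Dict[str, str]]],
    db_name: str,
    schema_name: str,
    view_name: str,
) -> Optional[str]:
    """Return the secure view's definition text, or None if it was not captured."""
    return definitions.get(db_name, {}).get(schema_name, {}).get(view_name)
```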