datahub-agent-context 1.3.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,325 @@
1
+ import logging
2
+ import re
3
+ from typing import Any, Dict, Optional
4
+
5
+ import cachetools
6
+
7
+ from datahub.cli.env_utils import get_boolean_env_variable
8
+ from datahub.ingestion.graph.client import DataHubGraph
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Cache to track whether newer GMS fields are supported for each graph instance
# Key: id(graph), Value: bool indicating if newer GMS fields are supported
# NOTE(review): keyed on id(), so an entry can in principle be reused by a new
# graph object allocated at the same address after the old one is GC'd —
# acceptable for long-lived clients, but worth confirming.
_newer_gms_fields_support_cache: dict[int, bool] = {}

# Default view configuration
# Set DATAHUB_MCP_DISABLE_DEFAULT_VIEW=true to skip fetching/applying the
# organization's default global view.
DISABLE_DEFAULT_VIEW = get_boolean_env_variable(
    "DATAHUB_MCP_DISABLE_DEFAULT_VIEW", default=False
)
VIEW_CACHE_TTL_SECONDS = 300  # 5 minutes; TTL for the default-view lookup cache
21
+
22
+
23
def _apply_feature_tags(query: str, *, is_cloud: bool, enable_newer_gms: bool) -> str:
    # Single place for marker processing so the initial attempt and the
    # fallback retry cannot drift apart (previously duplicated inline).
    query = _enable_cloud_fields(query) if is_cloud else _disable_cloud_fields(query)
    if enable_newer_gms:
        return _enable_newer_gms_fields(query)
    return _disable_newer_gms_fields(query)


def execute_graphql(
    graph: DataHubGraph,
    *,
    query: str,
    operation_name: Optional[str] = None,
    variables: Optional[Dict[str, Any]] = None,
) -> Any:
    """Execute a GraphQL query/mutation with feature-tag preprocessing.

    Lines in the query may carry ``#[CLOUD]`` or ``#[NEWER_GMS]`` marker
    comments. CLOUD-tagged lines are enabled only for DataHub Cloud
    instances; NEWER_GMS-tagged lines are enabled optimistically for Cloud
    (Cloud typically runs newer GMS) and disabled again — with a one-time
    retry of the current request — if the server rejects them with a schema
    validation/syntax error. The per-graph support result is cached in
    ``_newer_gms_fields_support_cache``.

    Args:
        graph: DataHub graph client to execute against.
        query: GraphQL document, possibly containing feature markers.
        operation_name: Optional GraphQL operation name.
        variables: Optional GraphQL variables.

    Returns:
        The GraphQL response payload from the client.

    Raises:
        Exception: Whatever the underlying client raises if the primary
            attempt and (when applicable) the fallback retry both fail.
    """
    graph_id = id(graph)
    original_query = query  # Keep original for fallback

    # Detect if this is a DataHub Cloud instance
    is_cloud = _is_datahub_cloud(graph)

    # Decide whether to try NEWER_GMS fields: use the cached answer for this
    # graph if we have one, otherwise assume Cloud implies newer GMS and
    # record that initial guess.
    if graph_id in _newer_gms_fields_support_cache:
        newer_gms_enabled_for_this_query = _newer_gms_fields_support_cache[graph_id]
    else:
        newer_gms_enabled_for_this_query = is_cloud
        _newer_gms_fields_support_cache[graph_id] = is_cloud

    query = _apply_feature_tags(
        original_query,
        is_cloud=is_cloud,
        enable_newer_gms=newer_gms_enabled_for_this_query,
    )

    logger.debug(
        f"Executing GraphQL {operation_name or 'query'}: "
        f"is_cloud={is_cloud}, newer_gms_enabled={newer_gms_enabled_for_this_query}"
    )
    logger.debug(
        f"GraphQL query for {operation_name or 'query'}:\n{query}\nVariables: {variables}"
    )

    try:
        # Execute the GraphQL query
        return graph.execute_graphql(
            query=query, variables=variables, operation_name=operation_name
        )
    except Exception as e:
        error_msg = str(e)

        # Only retry when the failure looks like a schema error AND the
        # failing query actually had newer GMS fields enabled.
        if _is_field_validation_error(error_msg) and newer_gms_enabled_for_this_query:
            logger.warning(
                f"GraphQL schema validation error detected for {operation_name or 'query'}. "
                f"Retrying without newer GMS fields as fallback."
            )
            logger.exception(e)

            # Remember that this server does NOT support the newer fields.
            _newer_gms_fields_support_cache[graph_id] = False

            try:
                fallback_query = _apply_feature_tags(
                    original_query, is_cloud=is_cloud, enable_newer_gms=False
                )
                logger.debug(
                    f"Retry {operation_name or 'query'} with NEWER_GMS fields disabled: "
                    f"is_cloud={is_cloud}"
                )
                result = graph.execute_graphql(
                    query=fallback_query,
                    variables=variables,
                    operation_name=operation_name,
                )
                logger.info(
                    f"Fallback query succeeded without newer GMS fields for operation: {operation_name}"
                )
                return result
            except Exception as fallback_error:
                logger.exception(
                    f"Fallback query also failed for {operation_name or 'query'}: {fallback_error}"
                )
                raise fallback_error
        elif (
            _is_field_validation_error(error_msg)
            and not newer_gms_enabled_for_this_query
        ):
            # Schema error that a retry cannot fix: likely a CLOUD-only field
            # on a non-cloud instance, or a field this GMS version lacks.
            logger.error(
                f"GraphQL schema validation error for {operation_name or 'query'} "
                f"but NEWER_GMS fields were already disabled (is_cloud={is_cloud}). "
                f"This may indicate a CLOUD-only field being used on a non-cloud instance, "
                f"or a field that's unavailable in this GMS version."
            )
            logger.exception(e)

        # Keep essential error logging for troubleshooting with full stack trace
        logger.exception(
            f"GraphQL {operation_name or 'query'} failed: {e}\n"
            f"Cloud instance: {is_cloud}\n"
            f"Newer GMS fields enabled: {_newer_gms_fields_support_cache.get(graph_id, 'unknown')}\n"
            f"Variables: {variables}"
        )
        raise
141
+
142
+
143
def _is_datahub_cloud(graph: DataHubGraph) -> bool:
    """Return True when *graph* looks like a DataHub Cloud instance.

    Cloud deployments expose ``frontend_base_url`` on the graph client and
    typically run newer GMS versions, so this doubles as the heuristic for
    newer-field detection. Detection can be switched off entirely via the
    DISABLE_NEWER_GMS_FIELD_DETECTION environment variable.
    """
    detection_disabled = get_boolean_env_variable(
        "DISABLE_NEWER_GMS_FIELD_DETECTION", default=False
    )
    if detection_disabled:
        logger.debug(
            "Newer GMS field detection is disabled via DISABLE_NEWER_GMS_FIELD_DETECTION"
        )
        return False

    is_cloud = hasattr(graph, "frontend_base_url") and graph.frontend_base_url
    logger.debug(f"Cloud detection: {is_cloud}")
    return bool(is_cloud)
158
+
159
+
160
+ def _is_field_validation_error(error_msg: str) -> bool:
161
+ """Check if the error is a GraphQL field/type validation or syntax error.
162
+
163
+ Includes InvalidSyntax because unknown types (like Document on older GMS)
164
+ cause syntax errors rather than validation errors.
165
+ """
166
+ return (
167
+ "FieldUndefined" in error_msg
168
+ or "ValidationError" in error_msg
169
+ or "InvalidSyntax" in error_msg
170
+ )
171
+
172
+
173
+ def _enable_newer_gms_fields(query: str) -> str:
174
+ """
175
+ Enable newer GMS fields by removing the #[NEWER_GMS] marker suffix.
176
+
177
+ Converts:
178
+ someField #[NEWER_GMS]
179
+ To:
180
+ someField
181
+ """
182
+ lines = query.split("\n")
183
+ cleaned_lines = [
184
+ line.replace(" #[NEWER_GMS]", "").replace("\t#[NEWER_GMS]", "")
185
+ for line in lines
186
+ ]
187
+ return "\n".join(cleaned_lines)
188
+
189
+
190
+ def _disable_newer_gms_fields(query: str) -> str:
191
+ """
192
+ Disable newer GMS fields by commenting out lines with #[NEWER_GMS] marker.
193
+
194
+ Converts:
195
+ someField #[NEWER_GMS]
196
+ To:
197
+ # someField #[NEWER_GMS]
198
+ """
199
+ lines = query.split("\n")
200
+ processed_lines = []
201
+ for line in lines:
202
+ if "#[NEWER_GMS]" in line:
203
+ # Comment out the line by prefixing with #
204
+ processed_lines.append("# " + line)
205
+ else:
206
+ processed_lines.append(line)
207
+ return "\n".join(processed_lines)
208
+
209
+
210
+ def _enable_cloud_fields(query: str) -> str:
211
+ """
212
+ Enable cloud fields by removing the #[CLOUD] marker suffix.
213
+
214
+ Converts:
215
+ someField #[CLOUD]
216
+ To:
217
+ someField
218
+ """
219
+ lines = query.split("\n")
220
+ cleaned_lines = [
221
+ line.replace(" #[CLOUD]", "").replace("\t#[CLOUD]", "") for line in lines
222
+ ]
223
+ return "\n".join(cleaned_lines)
224
+
225
+
226
+ def _disable_cloud_fields(query: str) -> str:
227
+ """
228
+ Disable cloud fields by commenting out lines with #[CLOUD] marker.
229
+
230
+ Converts:
231
+ someField #[CLOUD]
232
+ To:
233
+ # someField #[CLOUD]
234
+ """
235
+ lines = query.split("\n")
236
+ processed_lines = []
237
+ for line in lines:
238
+ if "#[CLOUD]" in line:
239
+ # Comment out the line by prefixing with #
240
+ processed_lines.append("# " + line)
241
+ else:
242
+ processed_lines.append(line)
243
+ return "\n".join(processed_lines)
244
+
245
+
246
@cachetools.cached(cache=cachetools.TTLCache(maxsize=1, ttl=VIEW_CACHE_TTL_SECONDS))
def fetch_global_default_view(graph: DataHubGraph) -> Optional[str]:
    """
    Fetch the organization's default global view URN unless disabled.
    Cached for VIEW_CACHE_TTL_SECONDS seconds.
    Returns None if disabled or if no default view is configured.
    """
    if DISABLE_DEFAULT_VIEW:
        # Feature switched off via environment variable: skip the lookup.
        return None

    query = """
    query getGlobalViewsSettings {
        globalViewsSettings {
            defaultView
        }
    }
    """

    settings = execute_graphql(graph, query=query).get("globalViewsSettings")
    view_urn = settings.get("defaultView") if settings else None
    if view_urn:
        logger.debug(f"Fetched global default view: {view_urn}")
        return view_urn
    logger.debug("No global default view configured")
    return None
274
+
275
+
276
def clean_gql_response(response: Any) -> Any:
    """
    Clean a GraphQL response by stripping metadata and empty values.

    Recursively removes:
    - __typename fields (GraphQL metadata not useful for consumers)
    - None values
    - Empty arrays []
    - Empty dicts {} (after cleaning)
    - Base64-encoded images embedded in "description" fields (these can be
      enormous — multiple megabytes)

    Args:
        response: Raw GraphQL response (dict, list, or primitive)

    Returns:
        Cleaned response with the same structure but without noise
    """
    if isinstance(response, list):
        return [clean_gql_response(entry) for entry in response]
    if not isinstance(response, dict):
        return response

    data_uri_pattern = r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+"
    markdown_img_pattern = r"!\[[^\]]*\]\(data:image/[^)]+\)"

    cleaned: dict = {}
    for key, raw_value in response.items():
        # Drop metadata keys and obviously-empty values up front.
        if key == "__typename" or raw_value is None or raw_value == []:
            continue
        value = clean_gql_response(raw_value)
        # Scrub inline base64 images out of description text.
        if key == "description" and isinstance(value, str) and "base64" in value:
            value = re.sub(data_uri_pattern, "[image removed]", value)
            value = re.sub(markdown_img_pattern, "[image removed]", value)
        # Dicts that cleaned down to nothing are dropped as well.
        if value is not None and value != {}:
            cleaned[key] = value
    return cleaned
@@ -0,0 +1,299 @@
1
+ """Description management tools for DataHub MCP server."""
2
+
3
+ import logging
4
+ from typing import Literal, Optional
5
+
6
+ from datahub_agent_context.context import get_graph
7
+ from datahub_agent_context.mcp_tools.base import execute_graphql
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def _get_existing_description(entity_urn: str, column_path: Optional[str]) -> str:
13
+ """Fetch existing description for entity or column."""
14
+ graph = get_graph()
15
+ query = """
16
+ query getEntity($urn: String!) {
17
+ entity(urn: $urn) {
18
+ ... on Dataset {
19
+ editableProperties {
20
+ description
21
+ }
22
+ schemaMetadata {
23
+ fields {
24
+ fieldPath
25
+ description
26
+ }
27
+ }
28
+ }
29
+ ... on Container {
30
+ editableProperties {
31
+ description
32
+ }
33
+ }
34
+ ... on Chart {
35
+ editableProperties {
36
+ description
37
+ }
38
+ }
39
+ ... on Dashboard {
40
+ editableProperties {
41
+ description
42
+ }
43
+ }
44
+ ... on DataFlow {
45
+ editableProperties {
46
+ description
47
+ }
48
+ }
49
+ ... on DataJob {
50
+ editableProperties {
51
+ description
52
+ }
53
+ }
54
+ ... on MLModel {
55
+ editableProperties {
56
+ description
57
+ }
58
+ }
59
+ ... on MLModelGroup {
60
+ editableProperties {
61
+ description
62
+ }
63
+ }
64
+ ... on MLFeatureTable {
65
+ editableProperties {
66
+ description
67
+ }
68
+ }
69
+ ... on MLPrimaryKey {
70
+ editableProperties {
71
+ description
72
+ }
73
+ }
74
+ ... on Tag {
75
+ properties {
76
+ description
77
+ }
78
+ }
79
+ ... on GlossaryTerm {
80
+ properties {
81
+ description
82
+ }
83
+ }
84
+ ... on GlossaryNode {
85
+ properties {
86
+ description
87
+ }
88
+ }
89
+ ... on Domain {
90
+ properties {
91
+ description
92
+ }
93
+ }
94
+ }
95
+ }
96
+ """
97
+
98
+ try:
99
+ result = execute_graphql(
100
+ graph,
101
+ query=query,
102
+ variables={"urn": entity_urn},
103
+ operation_name="getEntity",
104
+ )
105
+
106
+ entity_data = result.get("entity", {})
107
+ if column_path:
108
+ # Get column description
109
+ schema_metadata = entity_data.get("schemaMetadata", {})
110
+ fields = schema_metadata.get("fields", [])
111
+ for field in fields:
112
+ if field.get("fieldPath") == column_path:
113
+ return field.get("description", "")
114
+ return ""
115
+ else:
116
+ # Get entity description
117
+ # Try editableProperties first (for Dataset, Container, etc.)
118
+ editable_props = entity_data.get("editableProperties", {})
119
+ existing_description = editable_props.get("description", "")
120
+
121
+ # If not found, try properties (for Tag, GlossaryTerm, etc.)
122
+ if not existing_description:
123
+ properties = entity_data.get("properties", {})
124
+ existing_description = properties.get("description", "")
125
+
126
+ return existing_description
127
+
128
+ except Exception as e:
129
+ logger.warning(
130
+ f"Failed to fetch existing description for {entity_urn}: {e}. Will treat as empty."
131
+ )
132
+ return ""
133
+
134
+
135
+ def update_description(
136
+ entity_urn: str,
137
+ operation: Literal["replace", "append", "remove"] = "replace",
138
+ description: Optional[str] = None,
139
+ column_path: Optional[str] = None,
140
+ ) -> dict:
141
+ """Update description for a DataHub entity or its column (e.g., schema field).
142
+
143
+ This tool allows you to set, append to, or remove a description for an entity or its column.
144
+ Useful for documenting datasets, containers, charts, dashboards, data flows, data jobs,
145
+ ML models, ML model groups, ML feature tables, ML primary keys, tags, glossary terms,
146
+ glossary nodes, domains, and schema fields.
147
+
148
+ Args:
149
+ entity_urn: Entity URN to update description for (e.g., dataset URN, container URN)
150
+ operation: The operation to perform:
151
+ - "replace": Replace the existing description with the new one (default)
152
+ - "append": Append the new description to the existing one
153
+ - "remove": Remove the description (description parameter not needed)
154
+ description: The description text to set or append (supports markdown formatting).
155
+ Required for "replace" and "append" operations, ignored for "remove".
156
+ column_path: Column_path identifier (e.g., column name for schema field).
157
+ Optional for all entity types (use None for entity-level descriptions).
158
+ For column-level descriptions, provide the column name (e.g., "customer_email").
159
+ Verify that the column_path is correct and valid via the schemaMetadata.
160
+ Use get_entity tool to verify.
161
+
162
+ Returns:
163
+ Dictionary with:
164
+ - success: Boolean indicating if the operation succeeded
165
+ - urn: The entity URN
166
+ - column_path: The column path (if applicable)
167
+ - message: Success or error message
168
+
169
+ Examples:
170
+ # Update description for a container (entity-level)
171
+ update_description(
172
+ entity_urn="urn:li:container:12345",
173
+ operation="replace",
174
+ description="Production data warehouse"
175
+ )
176
+
177
+ # Update description for a dataset
178
+ update_description(
179
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
180
+ operation="replace",
181
+ description="User's table",
182
+ )
183
+
184
+ # Update description for a dataset field (column-level)
185
+ update_description(
186
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
187
+ operation="replace",
188
+ description="User's primary email address",
189
+ column_path="email"
190
+ )
191
+
192
+ # Append to existing field description
193
+ update_description(
194
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
195
+ operation="append",
196
+ description=" (PII)",
197
+ column_path="email"
198
+ )
199
+
200
+ # Remove field description
201
+ update_description(
202
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
203
+ operation="remove",
204
+ column_path="old_field"
205
+ )
206
+
207
+ Example:
208
+ from datahub_agent_context.context import DataHubContext
209
+
210
+ with DataHubContext(client.graph):
211
+ result = update_description(
212
+ entity_urn="urn:li:dataset:(...)",
213
+ operation="replace",
214
+ description="User table"
215
+ )
216
+ """
217
+ graph = get_graph()
218
+ # Validate inputs
219
+ if not entity_urn:
220
+ raise ValueError("entity_urn cannot be empty")
221
+
222
+ if operation in ("replace", "append"):
223
+ if not description:
224
+ raise ValueError(f"description is required for '{operation}' operation")
225
+ elif operation == "remove":
226
+ # For remove operation, ignore description parameter
227
+ description = ""
228
+ else:
229
+ raise ValueError(
230
+ f"Invalid operation '{operation}'. Must be 'replace', 'append', or 'remove'"
231
+ )
232
+
233
+ # For append operation, we need to fetch existing description first
234
+ existing_description = ""
235
+ if operation == "append":
236
+ existing_description = _get_existing_description(entity_urn, column_path)
237
+
238
+ # Determine final description based on operation
239
+ if operation == "append":
240
+ final_description = (
241
+ existing_description + description if existing_description else description
242
+ )
243
+ elif operation == "remove":
244
+ final_description = ""
245
+ else: # replace
246
+ final_description = description
247
+
248
+ # GraphQL mutation
249
+ mutation = """
250
+ mutation updateDescription($input: DescriptionUpdateInput!) {
251
+ updateDescription(input: $input)
252
+ }
253
+ """
254
+
255
+ variables: dict = {
256
+ "input": {
257
+ "description": final_description,
258
+ "resourceUrn": entity_urn,
259
+ }
260
+ }
261
+
262
+ # Add subresource fields if provided (for column-level descriptions)
263
+ if column_path:
264
+ variables["input"]["subResource"] = column_path
265
+ variables["input"]["subResourceType"] = "DATASET_FIELD"
266
+
267
+ try:
268
+ result = execute_graphql(
269
+ graph,
270
+ query=mutation,
271
+ variables=variables,
272
+ operation_name="updateDescription",
273
+ )
274
+
275
+ if result.get("updateDescription", False):
276
+ action_verb = "updated" if operation in ("replace", "append") else "removed"
277
+ return {
278
+ "success": True,
279
+ "urn": entity_urn,
280
+ "column_path": column_path,
281
+ "message": f"Description {action_verb} successfully",
282
+ }
283
+ else:
284
+ action = "update" if operation in ("replace", "append") else "remove"
285
+ raise RuntimeError(
286
+ f"Failed to {action} description for {entity_urn}"
287
+ + (f" column {column_path}" if column_path else "")
288
+ + " - operation returned false"
289
+ )
290
+
291
+ except Exception as e:
292
+ if isinstance(e, RuntimeError):
293
+ raise
294
+ action = "update" if operation in ("replace", "append") else "remove"
295
+ raise RuntimeError(
296
+ f"Error {action} description for {entity_urn}"
297
+ + (f" column {column_path}" if column_path else "")
298
+ + f": {str(e)}"
299
+ ) from e