datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Copyright 2025 Acryl Data, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""DataHub Agent Context - MCP Tools for AI Agents."""
|
|
16
|
+
|
|
17
|
+
from datahub_agent_context._version import __version__
|
|
18
|
+
from datahub_agent_context.context import (
|
|
19
|
+
DataHubContext,
|
|
20
|
+
get_graph,
|
|
21
|
+
reset_graph,
|
|
22
|
+
set_graph,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = ["__version__", "DataHubContext", "get_graph", "set_graph", "reset_graph"]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright 2025 Acryl Data, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
__package_name__ = "datahub-agent-context"
|
|
16
|
+
__version__ = "1.3.1.8"
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Context management for DataHub tools.
|
|
2
|
+
|
|
3
|
+
This module provides a context manager pattern for managing DataHubGraph instances
|
|
4
|
+
across tool calls without explicit parameter passing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import contextvars
|
|
8
|
+
from typing import TYPE_CHECKING, Optional
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
12
|
+
|
|
13
|
+
# Context variable to store the current DataHubGraph instance
|
|
14
|
+
_graph_context: contextvars.ContextVar[Optional["DataHubGraph"]] = (
|
|
15
|
+
contextvars.ContextVar("datahub_graph", default=None)
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_graph() -> "DataHubGraph":
    """Return the DataHubGraph bound to the current context.

    Returns:
        The DataHubGraph instance currently stored in the context variable.

    Raises:
        RuntimeError: If the context variable holds None, i.e. no graph
            has been set in the current context.
    """
    current = _graph_context.get()
    if current is not None:
        return current

    # Nothing bound: fail loudly rather than hand None to a tool.
    raise RuntimeError(
        "No DataHubGraph in context. "
        "Make sure to use DataHubContext context manager or set_graph() before calling tools."
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def set_graph(graph: "DataHubGraph") -> contextvars.Token:
    """Bind a DataHubGraph to the current context.

    Args:
        graph: DataHubGraph instance to store in the context variable.

    Returns:
        The token produced by the context variable's set() call; hand it to
        reset_graph() to restore the value that was current before this call.
    """
    token = _graph_context.set(graph)
    return token
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def reset_graph(token: contextvars.Token) -> None:
    """Restore the context's DataHubGraph to the value it had before set_graph().

    Args:
        token: The token returned by the matching set_graph() call.
    """
    # Delegates to ContextVar.reset(), which undoes the set() that made the token.
    _graph_context.reset(token)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class DataHubContext:
    """Context manager for DataHub tool execution.

    This context manager sets the DataHubGraph in context for the duration
    of the with block, allowing tools to access it without explicit parameter passing.
    The previous context value is restored on exit, including when the block raises.

    The same instance may be entered more than once, sequentially or nested:
    each exit restores the value that was current at the matching enter.

    Example:
        from datahub.sdk.main_client import DataHubClient
        from datahub_agent_context.context import DataHubContext
        from datahub_agent_context.mcp_tools import search

        client = DataHubClient(...)

        with DataHubContext(client.graph):
            results = search(query="users")  # No graph parameter needed!
    """

    def __init__(self, graph: "DataHubGraph"):
        """Initialize the context manager.

        Args:
            graph: DataHubGraph instance to use in this context
        """
        self.graph = graph
        # One token per active __enter__, innermost last. A single slot would be
        # overwritten by a nested enter of the same instance, and the outer exit
        # would then leave the graph set in the context.
        self._tokens: list[contextvars.Token] = []

    def __enter__(self) -> "DataHubGraph":
        """Enter the context and set the graph.

        Returns:
            The DataHubGraph instance
        """
        self._tokens.append(set_graph(self.graph))
        return self.graph

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context and reset the graph.

        Returns None, so an exception raised inside the with block propagates.
        Calling this without a matching __enter__ is a no-op.
        """
        if self._tokens:
            # Pop in LIFO order so nested enters unwind to the right values.
            reset_graph(self._tokens.pop())
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""LangChain integration for DataHub Agent Context.
|
|
2
|
+
|
|
3
|
+
This module provides utilities for building LangChain tools from DataHub MCP tools.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from datahub_agent_context.langchain_tools.builder import build_langchain_tools
|
|
7
|
+
|
|
8
|
+
__all__ = ["build_langchain_tools"]
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Builder for LangChain tools from DataHub MCP tools."""
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from typing import TYPE_CHECKING, Callable
|
|
5
|
+
|
|
6
|
+
from datahub_agent_context.context import set_graph
|
|
7
|
+
from datahub_agent_context.mcp_tools import get_me
|
|
8
|
+
from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents
|
|
9
|
+
from datahub_agent_context.mcp_tools.domains import remove_domains, set_domains
|
|
10
|
+
from datahub_agent_context.mcp_tools.structured_properties import (
|
|
11
|
+
add_structured_properties,
|
|
12
|
+
remove_structured_properties,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from datahub.sdk.main_client import DataHubClient
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
from langchain_core.tools import tool # type: ignore[import-not-found]
|
|
20
|
+
from langchain_core.tools.base import BaseTool # type: ignore[import-not-found]
|
|
21
|
+
except ImportError as e:
|
|
22
|
+
raise ImportError(
|
|
23
|
+
"langchain-core is required for LangChain tools. "
|
|
24
|
+
"Install with: pip install 'datahub-agent-context[langchain]'"
|
|
25
|
+
) from e
|
|
26
|
+
|
|
27
|
+
from datahub_agent_context.mcp_tools.descriptions import update_description
|
|
28
|
+
from datahub_agent_context.mcp_tools.entities import get_entities, list_schema_fields
|
|
29
|
+
from datahub_agent_context.mcp_tools.lineage import (
|
|
30
|
+
get_lineage,
|
|
31
|
+
get_lineage_paths_between,
|
|
32
|
+
)
|
|
33
|
+
from datahub_agent_context.mcp_tools.owners import add_owners, remove_owners
|
|
34
|
+
from datahub_agent_context.mcp_tools.queries import get_dataset_queries
|
|
35
|
+
from datahub_agent_context.mcp_tools.search import search
|
|
36
|
+
from datahub_agent_context.mcp_tools.tags import add_tags, remove_tags
|
|
37
|
+
from datahub_agent_context.mcp_tools.terms import (
|
|
38
|
+
add_glossary_terms,
|
|
39
|
+
remove_glossary_terms,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def create_context_wrapper(func: Callable, client: "DataHubClient") -> Callable:
    """Create a wrapper that sets DataHubGraph context before calling the function.

    This wrapper uses contextvars to set the graph in context for the duration
    of the function call, allowing the tool to retrieve it using get_graph().
    The context is reset as soon as ``func`` returns or raises, so work that
    ``func`` defers past its return (for example a returned coroutine) runs
    after the reset.

    Args:
        func: The tool function that retrieves graph from context
        client: DataHubClient instance whose graph will be set in context.
            ``client._graph`` is read on every call, not captured once.

    Returns:
        Wrapped function that sets context before execution and preserves
        ``func``'s metadata via functools.wraps
    """
    # Resolved once per wrapper instead of inside ``finally`` on every call.
    from datahub_agent_context.context import reset_graph

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Set graph in context for this function call
        token = set_graph(client._graph)
        try:
            return func(*args, **kwargs)
        finally:
            # Always reset context, even if function raises
            reset_graph(token)

    return wrapper
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def build_langchain_tools(
    client: "DataHubClient",
    include_mutations: bool = False,
) -> list[BaseTool]:
    """Build LangChain tools with automatic context management.

    Every tool function is passed through create_context_wrapper() and then
    through LangChain's ``tool`` factory, so the DataHubGraph is set in context
    before the tool runs and tools can retrieve it using get_graph().

    Args:
        client: DataHubClient instance
        include_mutations: Whether to include mutation tools (default: False)

    Returns:
        List of LangChain BaseTool instances: the read-only tools first,
        followed by the mutation tools when ``include_mutations`` is truthy.

    Example:
        from datahub.sdk.main_client import DataHubClient
        from datahub_agent_context.langchain_tools import build_langchain_tools

        client = DataHubClient(...)
        tools = build_langchain_tools(client, include_mutations=True)

        # Use with LangChain agents - context is managed automatically
        agent = create_react_agent(llm, tools, prompt)
        result = agent.invoke({"input": "search for datasets"})
    """
    # Read-only tools, in the order they are exposed to the agent.
    selected = [
        search_documents,
        grep_documents,
        get_entities,
        list_schema_fields,
        get_me,
        get_lineage,
        get_lineage_paths_between,
        get_dataset_queries,
        search,
    ]

    if include_mutations:
        # Tools that modify metadata are opt-in.
        selected += [
            update_description,
            set_domains,
            remove_domains,
            add_owners,
            remove_owners,
            add_structured_properties,
            remove_structured_properties,
            add_tags,
            remove_tags,
            add_glossary_terms,
            remove_glossary_terms,
        ]

    # Wrap each tool to manage context automatically
    return [tool(create_context_wrapper(fn, client)) for fn in selected]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""MCP tools for interacting with DataHub metadata."""
|
|
2
|
+
|
|
3
|
+
from datahub_agent_context.mcp_tools.descriptions import update_description
|
|
4
|
+
from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents
|
|
5
|
+
from datahub_agent_context.mcp_tools.domains import remove_domains, set_domains
|
|
6
|
+
from datahub_agent_context.mcp_tools.entities import get_entities, list_schema_fields
|
|
7
|
+
from datahub_agent_context.mcp_tools.get_me import get_me
|
|
8
|
+
from datahub_agent_context.mcp_tools.lineage import (
|
|
9
|
+
get_lineage,
|
|
10
|
+
get_lineage_paths_between,
|
|
11
|
+
)
|
|
12
|
+
from datahub_agent_context.mcp_tools.owners import add_owners, remove_owners
|
|
13
|
+
from datahub_agent_context.mcp_tools.queries import get_dataset_queries
|
|
14
|
+
from datahub_agent_context.mcp_tools.search import search
|
|
15
|
+
from datahub_agent_context.mcp_tools.structured_properties import (
|
|
16
|
+
add_structured_properties,
|
|
17
|
+
remove_structured_properties,
|
|
18
|
+
)
|
|
19
|
+
from datahub_agent_context.mcp_tools.tags import add_tags, remove_tags
|
|
20
|
+
from datahub_agent_context.mcp_tools.terms import (
|
|
21
|
+
add_glossary_terms,
|
|
22
|
+
remove_glossary_terms,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"search",
|
|
27
|
+
"get_entities",
|
|
28
|
+
"list_schema_fields",
|
|
29
|
+
"get_lineage",
|
|
30
|
+
"get_lineage_paths_between",
|
|
31
|
+
"get_dataset_queries",
|
|
32
|
+
"search_documents",
|
|
33
|
+
"grep_documents",
|
|
34
|
+
"add_tags",
|
|
35
|
+
"remove_tags",
|
|
36
|
+
"update_description",
|
|
37
|
+
"set_domains",
|
|
38
|
+
"remove_domains",
|
|
39
|
+
"add_owners",
|
|
40
|
+
"remove_owners",
|
|
41
|
+
"add_glossary_terms",
|
|
42
|
+
"remove_glossary_terms",
|
|
43
|
+
"add_structured_properties",
|
|
44
|
+
"remove_structured_properties",
|
|
45
|
+
"get_me",
|
|
46
|
+
]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Token count estimation utilities for MCP responses."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Union
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TokenCountEstimator:
    """Fast token estimation for MCP response budget management.

    Uses character-based heuristics instead of actual tokenization for performance.
    Accuracy is sufficient given the 90% budget buffer used in practice.
    """

    @staticmethod
    def estimate_dict_tokens(
        obj: Union[dict, list, str, int, float, bool, None],
    ) -> int:
        """Fast approximation of token count for dict/list structures.

        Recursively walks structure counting characters. Much faster than json.dumps + estimate_tokens.

        Tuples are counted like lists, since JSON serialisation emits both as
        arrays. Any other unrecognised type contributes a flat 10 characters.

        IMPORTANT: Assumes no circular references in the structure.
        Recursion stops (contributing 0) below MAX_DEPTH=100 levels of nesting.

        Args:
            obj: Dict, list, or primitive value (must not contain circular references)

        Returns:
            Approximate token count, computed as int(1.3 * chars / 4)
        """
        MAX_DEPTH = 100

        def _count_chars(item, depth: int = 0) -> int:
            if depth > MAX_DEPTH:
                # Lazy %-style args: the message is only formatted if emitted.
                logger.error(
                    "Max depth %s exceeded in structure, stopping recursion",
                    MAX_DEPTH,
                )
                return 0

            if item is None:
                return 4  # "null"
            elif isinstance(item, bool):
                # Must precede the int check: bool is a subclass of int.
                return 5  # "true" or "false"
            elif isinstance(item, str):
                # Account for:
                # - Quotes around string values: "value" → +6
                # - Escape characters (\n, \", \\, etc.) → +10% of length
                # Structural chars weighted heavier as they often tokenize separately
                base_length = len(item)
                escape_overhead = int(base_length * 0.1)
                return base_length + 6 + escape_overhead
            elif isinstance(item, (int, float)):
                return 6  # Average number length
            elif isinstance(item, (list, tuple)):
                # One extra char per element for the separating comma.
                return sum(_count_chars(elem, depth + 1) for elem in item) + len(item)
            elif isinstance(item, dict):
                total = 0
                for key, value in item.items():
                    # Account for: "key": value, → 2 quotes + colon + space + comma
                    # Structural chars weighted heavier (often separate tokens)
                    total += len(str(key)) + 9
                    total += _count_chars(value, depth + 1)
                return total + len(item)  # Additional padding for structure
            else:
                return 10  # Fallback for other types

        chars = _count_chars(obj, depth=0)
        # Use same formula as estimate_tokens for consistency
        return int(1.3 * chars / 4)
|