datahub-agent-context 1.3.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,25 @@
1
+ # Copyright 2025 Acryl Data, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """DataHub Agent Context - MCP Tools for AI Agents."""
16
+
17
+ from datahub_agent_context._version import __version__
18
+ from datahub_agent_context.context import (
19
+ DataHubContext,
20
+ get_graph,
21
+ reset_graph,
22
+ set_graph,
23
+ )
24
+
25
+ __all__ = ["__version__", "DataHubContext", "get_graph", "set_graph", "reset_graph"]
@@ -0,0 +1,16 @@
1
+ # Copyright 2025 Acryl Data, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __package_name__ = "datahub-agent-context"
16
+ __version__ = "1.3.1.8"
@@ -0,0 +1,97 @@
1
+ """Context management for DataHub tools.
2
+
3
+ This module provides a context manager pattern for managing DataHubGraph instances
4
+ across tool calls without explicit parameter passing.
5
+ """
6
+
7
+ import contextvars
8
+ from typing import TYPE_CHECKING, Optional
9
+
10
+ if TYPE_CHECKING:
11
+ from datahub.ingestion.graph.client import DataHubGraph
12
+
13
# Holds the active DataHubGraph for the current execution context.
_graph_context: contextvars.ContextVar[Optional["DataHubGraph"]] = (
    contextvars.ContextVar("datahub_graph", default=None)
)


def get_graph() -> "DataHubGraph":
    """Return the DataHubGraph currently stored in context.

    Returns:
        The active DataHubGraph instance.

    Raises:
        RuntimeError: If no graph has been set in the current context.
    """
    current = _graph_context.get()
    if current is not None:
        return current
    raise RuntimeError(
        "No DataHubGraph in context. "
        "Make sure to use DataHubContext context manager or set_graph() before calling tools."
    )


def set_graph(graph: "DataHubGraph") -> contextvars.Token:
    """Store ``graph`` in the current context.

    Args:
        graph: DataHubGraph instance to make available to tools.

    Returns:
        A token suitable for passing to reset_graph().
    """
    return _graph_context.set(graph)


def reset_graph(token: contextvars.Token) -> None:
    """Restore the context to the state captured by ``token``.

    Args:
        token: Token previously returned by set_graph().
    """
    _graph_context.reset(token)
56
+
57
+
58
class DataHubContext:
    """Context manager that scopes a DataHubGraph to a ``with`` block.

    While the block is active, tools can fetch the graph via get_graph()
    instead of receiving it as an explicit parameter.

    Example:
        from datahub.sdk.main_client import DataHubClient
        from datahub_agent_context.context import DataHubContext
        from datahub_agent_context.mcp_tools import search

        client = DataHubClient(...)

        with DataHubContext(client.graph):
            results = search(query="users")  # No graph parameter needed!
    """

    def __init__(self, graph: "DataHubGraph"):
        """Remember the graph to install on entry.

        Args:
            graph: DataHubGraph instance to use in this context
        """
        self.graph = graph
        self._token: Optional[contextvars.Token] = None

    def __enter__(self) -> "DataHubGraph":
        """Install the graph into context and hand it back to the caller.

        Returns:
            The DataHubGraph instance
        """
        self._token = set_graph(self.graph)
        return self.graph

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Undo the context change made in __enter__ (safe to call twice)."""
        token, self._token = self._token, None
        if token is not None:
            reset_graph(token)
@@ -0,0 +1,8 @@
1
+ """LangChain integration for DataHub Agent Context.
2
+
3
+ This module provides utilities for building LangChain tools from DataHub MCP tools.
4
+ """
5
+
6
+ from datahub_agent_context.langchain_tools.builder import build_langchain_tools
7
+
8
+ __all__ = ["build_langchain_tools"]
@@ -0,0 +1,127 @@
1
+ """Builder for LangChain tools from DataHub MCP tools."""
2
+
3
+ import functools
4
+ from typing import TYPE_CHECKING, Callable
5
+
6
+ from datahub_agent_context.context import set_graph
7
+ from datahub_agent_context.mcp_tools import get_me
8
+ from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents
9
+ from datahub_agent_context.mcp_tools.domains import remove_domains, set_domains
10
+ from datahub_agent_context.mcp_tools.structured_properties import (
11
+ add_structured_properties,
12
+ remove_structured_properties,
13
+ )
14
+
15
+ if TYPE_CHECKING:
16
+ from datahub.sdk.main_client import DataHubClient
17
+
18
+ try:
19
+ from langchain_core.tools import tool # type: ignore[import-not-found]
20
+ from langchain_core.tools.base import BaseTool # type: ignore[import-not-found]
21
+ except ImportError as e:
22
+ raise ImportError(
23
+ "langchain-core is required for LangChain tools. "
24
+ "Install with: pip install 'datahub-agent-context[langchain]'"
25
+ ) from e
26
+
27
+ from datahub_agent_context.mcp_tools.descriptions import update_description
28
+ from datahub_agent_context.mcp_tools.entities import get_entities, list_schema_fields
29
+ from datahub_agent_context.mcp_tools.lineage import (
30
+ get_lineage,
31
+ get_lineage_paths_between,
32
+ )
33
+ from datahub_agent_context.mcp_tools.owners import add_owners, remove_owners
34
+ from datahub_agent_context.mcp_tools.queries import get_dataset_queries
35
+ from datahub_agent_context.mcp_tools.search import search
36
+ from datahub_agent_context.mcp_tools.tags import add_tags, remove_tags
37
+ from datahub_agent_context.mcp_tools.terms import (
38
+ add_glossary_terms,
39
+ remove_glossary_terms,
40
+ )
41
+
42
+
43
def create_context_wrapper(func: Callable, client: "DataHubClient") -> Callable:
    """Wrap ``func`` so the client's graph is placed in context for each call.

    The returned callable installs ``client._graph`` via set_graph(), invokes
    ``func``, and always restores the previous context afterwards — even when
    ``func`` raises.

    Args:
        func: Tool function that looks up its graph with get_graph()
        client: DataHubClient whose graph is installed for the call

    Returns:
        A callable with the same signature and metadata as ``func``
    """

    @functools.wraps(func)
    def _with_graph_context(*args, **kwargs):
        # NOTE(review): reaches into the client's private ``_graph`` attribute.
        token = set_graph(client._graph)
        try:
            return func(*args, **kwargs)
        finally:
            # Lazy import preserved from the original layout; the reset runs
            # on both the success and the exception path.
            from datahub_agent_context.context import reset_graph

            reset_graph(token)

    return _with_graph_context
70
+
71
+
72
def build_langchain_tools(
    client: "DataHubClient",
    include_mutations: bool = False,
) -> list[BaseTool]:
    """Build LangChain tools with automatic context management.

    Every tool is wrapped so the client's DataHubGraph is installed in
    context for the duration of each call; tools then fetch it via
    get_graph() instead of taking a graph parameter.

    Args:
        client: DataHubClient instance
        include_mutations: Whether to include mutation tools (default: False)

    Returns:
        List of LangChain BaseTool instances

    Example:
        from datahub.sdk.main_client import DataHubClient
        from datahub_agent_context.langchain_tools import build_langchain_tools

        client = DataHubClient(...)
        tools = build_langchain_tools(client, include_mutations=True)

        # Use with LangChain agents - context is managed automatically
        agent = create_react_agent(llm, tools, prompt)
        result = agent.invoke({"input": "search for datasets"})
    """
    # Read-only tools, in the same registration order as before.
    read_only_fns = [
        search_documents,
        grep_documents,
        get_entities,
        list_schema_fields,
        get_me,
        get_lineage,
        get_lineage_paths_between,
        get_dataset_queries,
        search,
    ]

    # Tools that modify metadata; only exposed when explicitly requested.
    mutation_fns = [
        update_description,
        set_domains,
        remove_domains,
        add_owners,
        remove_owners,
        add_structured_properties,
        remove_structured_properties,
        add_tags,
        remove_tags,
        add_glossary_terms,
        remove_glossary_terms,
    ]

    selected = read_only_fns + (mutation_fns if include_mutations else [])
    # Each function gets the context wrapper before becoming a LangChain tool.
    return [tool(create_context_wrapper(fn, client)) for fn in selected]
@@ -0,0 +1,46 @@
1
+ """MCP tools for interacting with DataHub metadata."""
2
+
3
+ from datahub_agent_context.mcp_tools.descriptions import update_description
4
+ from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents
5
+ from datahub_agent_context.mcp_tools.domains import remove_domains, set_domains
6
+ from datahub_agent_context.mcp_tools.entities import get_entities, list_schema_fields
7
+ from datahub_agent_context.mcp_tools.get_me import get_me
8
+ from datahub_agent_context.mcp_tools.lineage import (
9
+ get_lineage,
10
+ get_lineage_paths_between,
11
+ )
12
+ from datahub_agent_context.mcp_tools.owners import add_owners, remove_owners
13
+ from datahub_agent_context.mcp_tools.queries import get_dataset_queries
14
+ from datahub_agent_context.mcp_tools.search import search
15
+ from datahub_agent_context.mcp_tools.structured_properties import (
16
+ add_structured_properties,
17
+ remove_structured_properties,
18
+ )
19
+ from datahub_agent_context.mcp_tools.tags import add_tags, remove_tags
20
+ from datahub_agent_context.mcp_tools.terms import (
21
+ add_glossary_terms,
22
+ remove_glossary_terms,
23
+ )
24
+
25
+ __all__ = [
26
+ "search",
27
+ "get_entities",
28
+ "list_schema_fields",
29
+ "get_lineage",
30
+ "get_lineage_paths_between",
31
+ "get_dataset_queries",
32
+ "search_documents",
33
+ "grep_documents",
34
+ "add_tags",
35
+ "remove_tags",
36
+ "update_description",
37
+ "set_domains",
38
+ "remove_domains",
39
+ "add_owners",
40
+ "remove_owners",
41
+ "add_glossary_terms",
42
+ "remove_glossary_terms",
43
+ "add_structured_properties",
44
+ "remove_structured_properties",
45
+ "get_me",
46
+ ]
@@ -0,0 +1,71 @@
1
+ """Token count estimation utilities for MCP responses."""
2
+
3
+ import logging
4
+ from typing import Union
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class TokenCountEstimator:
    """Cheap, character-based token estimation for MCP response budgeting.

    Deliberately avoids real tokenization for speed; the ~90% budget buffer
    used by callers absorbs the approximation error.
    """

    @staticmethod
    def estimate_dict_tokens(
        obj: Union[dict, list, str, int, float, bool, None],
    ) -> int:
        """Approximate the token count of a JSON-like structure.

        Recursively sums estimated serialized character counts, then
        converts characters to tokens. Much faster than json.dumps followed
        by a tokenizer pass.

        IMPORTANT: Assumes no circular references in the structure.
        Protected against infinite recursion with MAX_DEPTH=100.

        Args:
            obj: Dict, list, or primitive value (must be acyclic)

        Returns:
            Approximate token count
        """
        MAX_DEPTH = 100

        def _size(node, depth: int = 0) -> int:
            if depth > MAX_DEPTH:
                logger.error(
                    f"Max depth {MAX_DEPTH} exceeded in structure, stopping recursion"
                )
                return 0

            # bool must be tested before int (bool subclasses int).
            if node is None:
                return 4  # "null"
            if isinstance(node, bool):
                return 5  # "true" or "false"
            if isinstance(node, str):
                # Quotes around the value (+6) plus ~10% slack for escape
                # sequences; structural chars often tokenize separately.
                return len(node) + 6 + int(len(node) * 0.1)
            if isinstance(node, (int, float)):
                return 6  # average numeric literal width
            if isinstance(node, list):
                children = sum(_size(elem, depth + 1) for elem in node)
                return children + len(node)  # commas/brackets padding
            if isinstance(node, dict):
                acc = 0
                for key, value in node.items():
                    # "key": value, → 2 quotes + colon + space + comma etc.
                    acc += len(str(key)) + 9
                    acc += _size(value, depth + 1)
                return acc + len(node)  # additional structural padding
            return 10  # fallback for unexpected types

        # Same chars→tokens formula as estimate_tokens, for consistency.
        return int(1.3 * _size(obj, depth=0) / 4)