datahub-agent-context 1.3.1.8 (datahub_agent_context-1.3.1.8-py3-none-any.whl)

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (34)
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
datahub_agent_context/mcp_tools/owners.py (new file)
@@ -0,0 +1,318 @@
"""Owner management tools for DataHub MCP server."""

import logging
from typing import List, Literal, Optional

from datahub_agent_context.context import get_graph
from datahub_agent_context.mcp_tools.base import execute_graphql

logger = logging.getLogger(__name__)


def _validate_owner_urns(owner_urns: List[str]) -> None:
    """
    Validate that all owner URNs exist in DataHub and are either CorpUser or CorpGroup entities.

    Raises:
        ValueError: If any owner URN does not exist or is not a valid owner entity type
    """
    graph = get_graph()
    # Query to check if owners exist and are valid types
    query = """
    query getOwners($urns: [String!]!) {
        entities(urns: $urns) {
            urn
            type
            ... on CorpUser {
                username
            }
            ... on CorpGroup {
                name
            }
        }
    }
    """

    try:
        result = execute_graphql(
            graph,
            query=query,
            variables={"urns": owner_urns},
            operation_name="getOwners",
        )

        entities = result.get("entities", [])

        # Build a map of found URNs
        found_urns = {entity["urn"] for entity in entities if entity is not None}

        # Check for missing owners
        missing_urns = [urn for urn in owner_urns if urn not in found_urns]

        if missing_urns:
            raise ValueError(
                f"The following owner URNs do not exist in DataHub: {', '.join(missing_urns)}. "
                f"Please use the search tool with entity_type filter to find existing users or groups, "
                f"or create the owners first before assigning them."
            )

        # Verify all returned entities are either CorpUser or CorpGroup
        invalid_type_entities = [
            entity["urn"]
            for entity in entities
            if entity and entity.get("type") not in ("CORP_USER", "CORP_GROUP")
        ]
        if invalid_type_entities:
            raise ValueError(
                f"The following URNs are not valid owner entities (must be CorpUser or CorpGroup): {', '.join(invalid_type_entities)}"
            )

    except Exception as e:
        if isinstance(e, ValueError):
            raise
        raise ValueError(f"Failed to validate owner URNs: {str(e)}") from e


def _batch_modify_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str],
    operation: Literal["add", "remove"],
) -> dict:
    """
    Internal helper for batch owner operations (add/remove).

    Validates inputs, constructs GraphQL mutation, and executes the operation.
    """
    graph = get_graph()
    # Validate inputs
    if not owner_urns:
        raise ValueError("owner_urns cannot be empty")
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    # Validate that all owner URNs exist and are valid types
    _validate_owner_urns(owner_urns)

    # Build the resources list for GraphQL mutation
    resources = []
    for resource_urn in entity_urns:
        resource_input = {"resourceUrn": resource_urn}
        resources.append(resource_input)

    # Determine mutation and operation name based on operation type
    if operation == "add":
        # For adding owners, we need to include ownerEntityType
        # Determine owner entity types from URNs
        owners = []
        for owner_urn in owner_urns:
            owner_entity_type = (
                "CORP_USER" if ":corpuser:" in owner_urn.lower() else "CORP_GROUP"
            )
            owner_input: dict = {
                "ownerUrn": owner_urn,
                "ownerEntityType": owner_entity_type,
            }
            # Add ownership type if provided
            if ownership_type_urn:
                owner_input["ownershipTypeUrn"] = ownership_type_urn

            owners.append(owner_input)

        mutation = """
        mutation batchAddOwners($input: BatchAddOwnersInput!) {
            batchAddOwners(input: $input)
        }
        """
        add_input: dict = {
            "owners": owners,
            "resources": resources,
        }
        if ownership_type_urn:
            add_input["ownershipTypeUrn"] = ownership_type_urn

        variables = {"input": add_input}

        operation_name = "batchAddOwners"
        success_verb = "added"
        failure_verb = "add"
    else:  # remove
        mutation = """
        mutation batchRemoveOwners($input: BatchRemoveOwnersInput!) {
            batchRemoveOwners(input: $input)
        }
        """
        remove_input: dict = {
            "ownerUrns": owner_urns,
            "resources": resources,
        }
        if ownership_type_urn:
            remove_input["ownershipTypeUrn"] = ownership_type_urn

        variables = {"input": remove_input}

        operation_name = "batchRemoveOwners"
        success_verb = "removed"
        failure_verb = "remove"

    try:
        result = execute_graphql(
            graph,
            query=mutation,
            variables=variables,
            operation_name=operation_name,
        )

        success = result.get(operation_name, False)
        if success:
            preposition = "to" if operation == "add" else "from"
            return {
                "success": True,
                "message": f"Successfully {success_verb} {len(owner_urns)} owner(s) {preposition} {len(entity_urns)} entit(ies)",
            }
        else:
            raise RuntimeError(
                f"Failed to {failure_verb} owners - operation returned false"
            )

    except Exception as e:
        if isinstance(e, RuntimeError):
            raise
        raise RuntimeError(f"Error {failure_verb} owners: {str(e)}") from e


def add_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str] = None,
) -> dict:
    """Add one or more owners to multiple DataHub entities.

    This tool allows you to assign multiple owners to multiple entities in a single operation.
    Useful for bulk ownership assignment, such as assigning data stewards, technical owners,
    or business owners to datasets, dashboards, and other DataHub entities.

    Note: Ownership in DataHub is entity-level only. For field-level metadata, use tags or glossary terms instead.

    Args:
        owner_urns: List of owner URNs to add (must be CorpUser or CorpGroup URNs).
            Examples: ["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"]
        entity_urns: List of entity URNs to assign ownership to (e.g., dataset URNs, dashboard URNs)
        ownership_type_urn: Optional ownership type URN to specify the type of ownership
            (e.g., "urn:li:ownershipType:dataowner", "urn:li:ownershipType:technical_owner").
            If not provided, the ownership type is set based on the mutation default.

    Returns:
        Dictionary with:
        - success: Boolean indicating if the operation succeeded
        - message: Success or error message

    Examples:
        # Add owners to multiple datasets
        add_owners(
            owner_urns=["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Add technical owner with specific ownership type
        add_owners(
            owner_urns=["urn:li:corpuser:jane.smith"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:technical_owner"
        )

        # Add data owner to multiple entities
        add_owners(
            owner_urns=["urn:li:corpuser:data.steward"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.sales,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.transactions,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,sales_dashboard,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:dataowner"
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = add_owners(
                owner_urns=["urn:li:corpuser:john.doe"],
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    return _batch_modify_owners(owner_urns, entity_urns, ownership_type_urn, "add")


def remove_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str] = None,
) -> dict:
    """Remove one or more owners from multiple DataHub entities.

    This tool allows you to remove multiple owners from multiple entities in a single operation.
    Useful for bulk ownership removal, such as removing owners when they change roles,
    cleaning up stale ownership, or correcting misassigned ownership.

    Note: Ownership in DataHub is entity-level only. For field-level metadata, use tags or glossary terms instead.

    Args:
        owner_urns: List of owner URNs to remove (must be CorpUser or CorpGroup URNs).
            Examples: ["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"]
        entity_urns: List of entity URNs to remove ownership from (e.g., dataset URNs, dashboard URNs)
        ownership_type_urn: Optional ownership type URN to specify which type of ownership to remove
            (e.g., "urn:li:ownershipType:dataowner").
            If not provided, ownership is removed regardless of type.

    Returns:
        Dictionary with:
        - success: Boolean indicating if the operation succeeded
        - message: Success or error message

    Examples:
        # Remove owners from multiple datasets
        remove_owners(
            owner_urns=["urn:li:corpuser:former.employee", "urn:li:corpGroup:old-team"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Remove technical owner with specific ownership type
        remove_owners(
            owner_urns=["urn:li:corpuser:john.doe"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:technical_owner"
        )

        # Remove temporary owner from multiple entities
        remove_owners(
            owner_urns=["urn:li:corpuser:temp.owner"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.stable_table,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,temp_dashboard,PROD)"
            ]
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = remove_owners(
                owner_urns=["urn:li:corpuser:former.employee"],
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    return _batch_modify_owners(owner_urns, entity_urns, ownership_type_urn, "remove")
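The sketch below is illustrative only and is not part of the packaged module; it shows one way the two public tools above might be driven from application code. It assumes a client object whose .graph attribute is accepted by DataHubContext (mirroring the docstring examples) and a hypothetical steward URN.

# Hedged usage sketch for owners.py - `client.graph` and the steward URN are assumptions,
# not part of this package.
from datahub_agent_context.context import DataHubContext
from datahub_agent_context.mcp_tools.owners import add_owners

def assign_steward(client, dataset_urns: list) -> dict:
    """Assign a data steward to several datasets, surfacing validation errors cleanly."""
    with DataHubContext(client.graph):
        try:
            return add_owners(
                owner_urns=["urn:li:corpuser:data.steward"],  # hypothetical owner URN
                entity_urns=dataset_urns,
                ownership_type_urn="urn:li:ownershipType:dataowner",
            )
        except ValueError as e:
            # Raised by _validate_owner_urns when an owner URN is missing
            # or is not a CorpUser/CorpGroup entity.
            return {"success": False, "message": str(e)}
        except RuntimeError as e:
            # Raised by _batch_modify_owners when the batchAddOwners mutation fails.
            return {"success": False, "message": str(e)}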
datahub_agent_context/mcp_tools/queries.py (new file)
@@ -0,0 +1,191 @@
"""Tools for getting dataset queries."""

import contextlib
import logging
import pathlib
from typing import Literal, Optional

from datahub.sdk.search_client import compile_filters
from datahub.sdk.search_filters import FilterDsl
from datahub.utilities.ordered_set import OrderedSet
from datahub_agent_context.context import get_graph
from datahub_agent_context.mcp_tools.base import clean_gql_response, execute_graphql
from datahub_agent_context.mcp_tools.helpers import (
    maybe_convert_to_schema_field_urn,
    truncate_query,
)

logger = logging.getLogger(__name__)

# Load GraphQL query
queries_gql = (pathlib.Path(__file__).parent / "gql/queries.gql").read_text()


def _deduplicate_subjects(subjects: list[dict]) -> list[str]:
    """Deduplicate subjects to unique dataset URNs.

    The "subjects" field returns every dataset and schema field associated with the query.
    While this is useful for our backend to have, it's not useful here because
    we can just look at the query directly. So we'll narrow it down to the unique
    list of dataset urns.

    Args:
        subjects: List of subject dicts with dataset/schemaField info

    Returns:
        List of unique dataset URNs
    """
    updated_subjects: OrderedSet[str] = OrderedSet()
    for subject in subjects:
        with contextlib.suppress(KeyError):
            updated_subjects.add(subject["dataset"]["urn"])
    return list(updated_subjects)


def get_dataset_queries(
    urn: str,
    column: Optional[str] = None,
    source: Optional[Literal["MANUAL", "SYSTEM"]] = None,
    start: int = 0,
    count: int = 10,
) -> dict:
    """Get SQL queries associated with a dataset or column to understand usage patterns.

    This tool retrieves actual SQL queries that reference a specific dataset or column.
    Useful for understanding how data is used, common JOIN patterns, typical filters,
    and aggregation logic.

    Args:
        urn: Dataset URN
        column: Optional column name to filter queries
        source: Filter by query origin:
            - "MANUAL": Queries written by users in query editors (real SQL patterns)
            - "SYSTEM": Queries extracted from BI tools/dashboards (production usage)
            - None: Return both types (default)
        start: Starting offset for pagination (default: 0)
        count: Number of queries to return (default: 10)

    Returns:
        Dictionary with:
        - total: Total number of queries matching criteria
        - start: Starting offset
        - count: Number of results returned
        - queries: Array of query objects with:
            - urn: Query identifier
            - properties.statement.value: The actual SQL text
            - properties.statement.language: Query language (SQL, etc.)
            - properties.source: MANUAL or SYSTEM
            - properties.name: Optional query name
            - platform: Source platform
            - subjects: Referenced datasets/columns (deduplicated to dataset URNs)

    COMMON USE CASES:

    1. SQL generation - learn real query patterns:
       get_dataset_queries(urn, source="MANUAL", count=10)
       → See how users actually write SQL against this table
       → Discover common JOINs, aggregations, filters
       → Match organizational SQL conventions and patterns

    2. Production usage analysis:
       get_dataset_queries(urn, source="SYSTEM", count=20)
       → See how dashboards and reports query this data
       → Understand which queries run in production
       → Identify critical query patterns

    3. Column usage patterns:
       get_dataset_queries(urn, column="customer_id", source="MANUAL", count=5)
       → See how a specific column is used in queries
       → Learn filtering and grouping patterns for that column
       → Discover relationships via JOIN patterns

    4. General usage exploration:
       get_dataset_queries(urn, count=10)
       → Get a mix of manual and system queries
       → Understand overall table usage

    EXAMPLES:

    - Get manual queries for SQL generation:
      get_dataset_queries(
          urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,prod.sales.orders,PROD)",
          source="MANUAL",
          count=10
      )

    - Get dashboard queries (production usage):
      get_dataset_queries(
          urn="urn:li:dataset:(...)",
          source="SYSTEM",
          count=20
      )

    - Column-specific query patterns:
      get_dataset_queries(
          urn="urn:li:dataset:(...)",
          column="created_at",
          source="MANUAL",
          count=5
      )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_dataset_queries(urn="urn:li:dataset:(...)", source="MANUAL")

    ANALYZING RETRIEVED QUERIES:
    Once you retrieve queries, examine the SQL statements to identify:
    - JOIN patterns: Which tables are joined? On what keys?
    - Aggregations: Common SUM, COUNT, AVG, GROUP BY patterns
    - Filters: Typical WHERE clauses, date range logic
    - Column usage: Which columns appear frequently vs rarely
    - CTEs and subqueries: Complex query structures

    BEST PRACTICES:
    - For SQL generation: Use source="MANUAL" (count=5-10) to see real user patterns
    - For production analysis: Use source="SYSTEM" to see dashboard/report queries
    - Start with moderate count (5-10) to avoid overwhelming context
    - If no queries found (total=0), proceed without query examples - not all tables have queries
    - Parse the SQL statements yourself to find patterns - they are not full-text searchable
    """
    graph = get_graph()
    urn = maybe_convert_to_schema_field_urn(urn, column)

    entities_filter = FilterDsl.custom_filter(
        field="entities", condition="EQUAL", values=[urn]
    )
    _, compiled_filters = compile_filters(entities_filter)

    # Set up variables for the query
    variables = {
        "input": {
            "start": start,
            "count": count,
            "orFilters": compiled_filters,
        }
    }

    # Add optional source filter
    if source is not None:
        variables["input"]["source"] = source

    # Execute the GraphQL query
    result = execute_graphql(
        graph,
        query=queries_gql,
        variables=variables,
        operation_name="listQueries",
    )["listQueries"]

    for query in result["queries"]:
        if query.get("subjects"):
            query["subjects"] = _deduplicate_subjects(query["subjects"])

        # Truncate long SQL queries to prevent context window issues
        if queryProperties := query.get("properties"):
            queryProperties["statement"]["value"] = truncate_query(
                queryProperties["statement"]["value"]
            )

    return clean_gql_response(result)
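A minimal consumption sketch, illustrative only and not part of the packaged module: it pulls user-written SQL out of the return structure documented in the get_dataset_queries docstring, assuming a client object whose .graph attribute is accepted by DataHubContext.

# Hedged usage sketch for queries.py - `client` and the field paths below follow the
# documented return shape; treat both as assumptions rather than guaranteed API.
from datahub_agent_context.context import DataHubContext
from datahub_agent_context.mcp_tools.queries import get_dataset_queries

def collect_manual_sql(client, dataset_urn: str, limit: int = 5) -> list:
    """Return up to `limit` user-written SQL statements that reference a dataset."""
    with DataHubContext(client.graph):
        result = get_dataset_queries(urn=dataset_urn, source="MANUAL", count=limit)
    statements = []
    for query in result.get("queries", []):
        properties = query.get("properties") or {}
        statement = (properties.get("statement") or {}).get("value")
        if statement:
            statements.append(statement)  # already shortened by truncate_query
    return statements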