datahub-agent-context 1.3.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,325 @@
1
+ import logging
2
+ import re
3
+ from typing import Any, Dict, Optional
4
+
5
+ import cachetools
6
+
7
+ from datahub.cli.env_utils import get_boolean_env_variable
8
+ from datahub.ingestion.graph.client import DataHubGraph
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Cache to track whether newer GMS fields are supported for each graph instance
# Key: id(graph), Value: bool indicating if newer GMS fields are supported
# NOTE(review): keyed on id(), so an entry can in principle be reused by a new
# graph object allocated at the same address after the old one is GC'd —
# acceptable for long-lived clients, but worth confirming.
_newer_gms_fields_support_cache: dict[int, bool] = {}

# Default view configuration
# Set DATAHUB_MCP_DISABLE_DEFAULT_VIEW=true to skip fetching/applying the
# organization's default global view.
DISABLE_DEFAULT_VIEW = get_boolean_env_variable(
    "DATAHUB_MCP_DISABLE_DEFAULT_VIEW", default=False
)
VIEW_CACHE_TTL_SECONDS = 300  # 5 minutes; TTL for the default-view lookup cache
21
+
22
+
23
def _apply_feature_tags(query: str, *, is_cloud: bool, enable_newer_gms: bool) -> str:
    # Single place for marker processing so the initial attempt and the
    # fallback retry cannot drift apart (previously duplicated inline).
    query = _enable_cloud_fields(query) if is_cloud else _disable_cloud_fields(query)
    if enable_newer_gms:
        return _enable_newer_gms_fields(query)
    return _disable_newer_gms_fields(query)


def execute_graphql(
    graph: DataHubGraph,
    *,
    query: str,
    operation_name: Optional[str] = None,
    variables: Optional[Dict[str, Any]] = None,
) -> Any:
    """Execute a GraphQL query/mutation with feature-tag preprocessing.

    Lines in the query may carry ``#[CLOUD]`` or ``#[NEWER_GMS]`` marker
    comments. CLOUD-tagged lines are enabled only for DataHub Cloud
    instances; NEWER_GMS-tagged lines are enabled optimistically for Cloud
    (Cloud typically runs newer GMS) and disabled again — with a one-time
    retry of the current request — if the server rejects them with a schema
    validation/syntax error. The per-graph support result is cached in
    ``_newer_gms_fields_support_cache``.

    Args:
        graph: DataHub graph client to execute against.
        query: GraphQL document, possibly containing feature markers.
        operation_name: Optional GraphQL operation name.
        variables: Optional GraphQL variables.

    Returns:
        The GraphQL response payload from the client.

    Raises:
        Exception: Whatever the underlying client raises if the primary
            attempt and (when applicable) the fallback retry both fail.
    """
    graph_id = id(graph)
    original_query = query  # Keep original for fallback

    # Detect if this is a DataHub Cloud instance
    is_cloud = _is_datahub_cloud(graph)

    # Decide whether to try NEWER_GMS fields: use the cached answer for this
    # graph if we have one, otherwise assume Cloud implies newer GMS and
    # record that initial guess.
    if graph_id in _newer_gms_fields_support_cache:
        newer_gms_enabled_for_this_query = _newer_gms_fields_support_cache[graph_id]
    else:
        newer_gms_enabled_for_this_query = is_cloud
        _newer_gms_fields_support_cache[graph_id] = is_cloud

    query = _apply_feature_tags(
        original_query,
        is_cloud=is_cloud,
        enable_newer_gms=newer_gms_enabled_for_this_query,
    )

    logger.debug(
        f"Executing GraphQL {operation_name or 'query'}: "
        f"is_cloud={is_cloud}, newer_gms_enabled={newer_gms_enabled_for_this_query}"
    )
    logger.debug(
        f"GraphQL query for {operation_name or 'query'}:\n{query}\nVariables: {variables}"
    )

    try:
        # Execute the GraphQL query
        return graph.execute_graphql(
            query=query, variables=variables, operation_name=operation_name
        )
    except Exception as e:
        error_msg = str(e)

        # Only retry when the failure looks like a schema error AND the
        # failing query actually had newer GMS fields enabled.
        if _is_field_validation_error(error_msg) and newer_gms_enabled_for_this_query:
            logger.warning(
                f"GraphQL schema validation error detected for {operation_name or 'query'}. "
                f"Retrying without newer GMS fields as fallback."
            )
            logger.exception(e)

            # Remember that this server does NOT support the newer fields.
            _newer_gms_fields_support_cache[graph_id] = False

            try:
                fallback_query = _apply_feature_tags(
                    original_query, is_cloud=is_cloud, enable_newer_gms=False
                )
                logger.debug(
                    f"Retry {operation_name or 'query'} with NEWER_GMS fields disabled: "
                    f"is_cloud={is_cloud}"
                )
                result = graph.execute_graphql(
                    query=fallback_query,
                    variables=variables,
                    operation_name=operation_name,
                )
                logger.info(
                    f"Fallback query succeeded without newer GMS fields for operation: {operation_name}"
                )
                return result
            except Exception as fallback_error:
                logger.exception(
                    f"Fallback query also failed for {operation_name or 'query'}: {fallback_error}"
                )
                raise fallback_error
        elif (
            _is_field_validation_error(error_msg)
            and not newer_gms_enabled_for_this_query
        ):
            # Schema error that a retry cannot fix: likely a CLOUD-only field
            # on a non-cloud instance, or a field this GMS version lacks.
            logger.error(
                f"GraphQL schema validation error for {operation_name or 'query'} "
                f"but NEWER_GMS fields were already disabled (is_cloud={is_cloud}). "
                f"This may indicate a CLOUD-only field being used on a non-cloud instance, "
                f"or a field that's unavailable in this GMS version."
            )
            logger.exception(e)

        # Keep essential error logging for troubleshooting with full stack trace
        logger.exception(
            f"GraphQL {operation_name or 'query'} failed: {e}\n"
            f"Cloud instance: {is_cloud}\n"
            f"Newer GMS fields enabled: {_newer_gms_fields_support_cache.get(graph_id, 'unknown')}\n"
            f"Variables: {variables}"
        )
        raise
141
+
142
+
143
def _is_datahub_cloud(graph: DataHubGraph) -> bool:
    """Return True when *graph* looks like a DataHub Cloud instance.

    Cloud deployments expose ``frontend_base_url`` on the graph client and
    typically run newer GMS versions, so this doubles as the heuristic for
    newer-field detection. Detection can be switched off entirely via the
    DISABLE_NEWER_GMS_FIELD_DETECTION environment variable.
    """
    detection_disabled = get_boolean_env_variable(
        "DISABLE_NEWER_GMS_FIELD_DETECTION", default=False
    )
    if detection_disabled:
        logger.debug(
            "Newer GMS field detection is disabled via DISABLE_NEWER_GMS_FIELD_DETECTION"
        )
        return False

    is_cloud = hasattr(graph, "frontend_base_url") and graph.frontend_base_url
    logger.debug(f"Cloud detection: {is_cloud}")
    return bool(is_cloud)
158
+
159
+
160
+ def _is_field_validation_error(error_msg: str) -> bool:
161
+ """Check if the error is a GraphQL field/type validation or syntax error.
162
+
163
+ Includes InvalidSyntax because unknown types (like Document on older GMS)
164
+ cause syntax errors rather than validation errors.
165
+ """
166
+ return (
167
+ "FieldUndefined" in error_msg
168
+ or "ValidationError" in error_msg
169
+ or "InvalidSyntax" in error_msg
170
+ )
171
+
172
+
173
+ def _enable_newer_gms_fields(query: str) -> str:
174
+ """
175
+ Enable newer GMS fields by removing the #[NEWER_GMS] marker suffix.
176
+
177
+ Converts:
178
+ someField #[NEWER_GMS]
179
+ To:
180
+ someField
181
+ """
182
+ lines = query.split("\n")
183
+ cleaned_lines = [
184
+ line.replace(" #[NEWER_GMS]", "").replace("\t#[NEWER_GMS]", "")
185
+ for line in lines
186
+ ]
187
+ return "\n".join(cleaned_lines)
188
+
189
+
190
+ def _disable_newer_gms_fields(query: str) -> str:
191
+ """
192
+ Disable newer GMS fields by commenting out lines with #[NEWER_GMS] marker.
193
+
194
+ Converts:
195
+ someField #[NEWER_GMS]
196
+ To:
197
+ # someField #[NEWER_GMS]
198
+ """
199
+ lines = query.split("\n")
200
+ processed_lines = []
201
+ for line in lines:
202
+ if "#[NEWER_GMS]" in line:
203
+ # Comment out the line by prefixing with #
204
+ processed_lines.append("# " + line)
205
+ else:
206
+ processed_lines.append(line)
207
+ return "\n".join(processed_lines)
208
+
209
+
210
+ def _enable_cloud_fields(query: str) -> str:
211
+ """
212
+ Enable cloud fields by removing the #[CLOUD] marker suffix.
213
+
214
+ Converts:
215
+ someField #[CLOUD]
216
+ To:
217
+ someField
218
+ """
219
+ lines = query.split("\n")
220
+ cleaned_lines = [
221
+ line.replace(" #[CLOUD]", "").replace("\t#[CLOUD]", "") for line in lines
222
+ ]
223
+ return "\n".join(cleaned_lines)
224
+
225
+
226
+ def _disable_cloud_fields(query: str) -> str:
227
+ """
228
+ Disable cloud fields by commenting out lines with #[CLOUD] marker.
229
+
230
+ Converts:
231
+ someField #[CLOUD]
232
+ To:
233
+ # someField #[CLOUD]
234
+ """
235
+ lines = query.split("\n")
236
+ processed_lines = []
237
+ for line in lines:
238
+ if "#[CLOUD]" in line:
239
+ # Comment out the line by prefixing with #
240
+ processed_lines.append("# " + line)
241
+ else:
242
+ processed_lines.append(line)
243
+ return "\n".join(processed_lines)
244
+
245
+
246
@cachetools.cached(cache=cachetools.TTLCache(maxsize=1, ttl=VIEW_CACHE_TTL_SECONDS))
def fetch_global_default_view(graph: DataHubGraph) -> Optional[str]:
    """
    Fetch the organization's default global view URN unless disabled.
    Cached for VIEW_CACHE_TTL_SECONDS seconds.
    Returns None if disabled or if no default view is configured.
    """
    if DISABLE_DEFAULT_VIEW:
        # Feature switched off via environment variable: skip the lookup.
        return None

    query = """
    query getGlobalViewsSettings {
        globalViewsSettings {
            defaultView
        }
    }
    """

    settings = execute_graphql(graph, query=query).get("globalViewsSettings")
    view_urn = settings.get("defaultView") if settings else None
    if view_urn:
        logger.debug(f"Fetched global default view: {view_urn}")
        return view_urn
    logger.debug("No global default view configured")
    return None
274
+
275
+
276
def clean_gql_response(response: Any) -> Any:
    """
    Clean a GraphQL response by stripping metadata and empty values.

    Recursively removes:
    - __typename fields (GraphQL metadata not useful for consumers)
    - None values
    - Empty arrays []
    - Empty dicts {} (after cleaning)
    - Base64-encoded images embedded in "description" fields (these can be
      enormous — multiple megabytes)

    Args:
        response: Raw GraphQL response (dict, list, or primitive)

    Returns:
        Cleaned response with the same structure but without noise
    """
    if isinstance(response, list):
        return [clean_gql_response(entry) for entry in response]
    if not isinstance(response, dict):
        return response

    data_uri_pattern = r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+"
    markdown_img_pattern = r"!\[[^\]]*\]\(data:image/[^)]+\)"

    cleaned: dict = {}
    for key, raw_value in response.items():
        # Drop metadata keys and obviously-empty values up front.
        if key == "__typename" or raw_value is None or raw_value == []:
            continue
        value = clean_gql_response(raw_value)
        # Scrub inline base64 images out of description text.
        if key == "description" and isinstance(value, str) and "base64" in value:
            value = re.sub(data_uri_pattern, "[image removed]", value)
            value = re.sub(markdown_img_pattern, "[image removed]", value)
        # Dicts that cleaned down to nothing are dropped as well.
        if value is not None and value != {}:
            cleaned[key] = value
    return cleaned
@@ -0,0 +1,299 @@
1
+ """Description management tools for DataHub MCP server."""
2
+
3
+ import logging
4
+ from typing import Literal, Optional
5
+
6
+ from datahub_agent_context.context import get_graph
7
+ from datahub_agent_context.mcp_tools.base import execute_graphql
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def _get_existing_description(entity_urn: str, column_path: Optional[str]) -> str:
13
+ """Fetch existing description for entity or column."""
14
+ graph = get_graph()
15
+ query = """
16
+ query getEntity($urn: String!) {
17
+ entity(urn: $urn) {
18
+ ... on Dataset {
19
+ editableProperties {
20
+ description
21
+ }
22
+ schemaMetadata {
23
+ fields {
24
+ fieldPath
25
+ description
26
+ }
27
+ }
28
+ }
29
+ ... on Container {
30
+ editableProperties {
31
+ description
32
+ }
33
+ }
34
+ ... on Chart {
35
+ editableProperties {
36
+ description
37
+ }
38
+ }
39
+ ... on Dashboard {
40
+ editableProperties {
41
+ description
42
+ }
43
+ }
44
+ ... on DataFlow {
45
+ editableProperties {
46
+ description
47
+ }
48
+ }
49
+ ... on DataJob {
50
+ editableProperties {
51
+ description
52
+ }
53
+ }
54
+ ... on MLModel {
55
+ editableProperties {
56
+ description
57
+ }
58
+ }
59
+ ... on MLModelGroup {
60
+ editableProperties {
61
+ description
62
+ }
63
+ }
64
+ ... on MLFeatureTable {
65
+ editableProperties {
66
+ description
67
+ }
68
+ }
69
+ ... on MLPrimaryKey {
70
+ editableProperties {
71
+ description
72
+ }
73
+ }
74
+ ... on Tag {
75
+ properties {
76
+ description
77
+ }
78
+ }
79
+ ... on GlossaryTerm {
80
+ properties {
81
+ description
82
+ }
83
+ }
84
+ ... on GlossaryNode {
85
+ properties {
86
+ description
87
+ }
88
+ }
89
+ ... on Domain {
90
+ properties {
91
+ description
92
+ }
93
+ }
94
+ }
95
+ }
96
+ """
97
+
98
+ try:
99
+ result = execute_graphql(
100
+ graph,
101
+ query=query,
102
+ variables={"urn": entity_urn},
103
+ operation_name="getEntity",
104
+ )
105
+
106
+ entity_data = result.get("entity", {})
107
+ if column_path:
108
+ # Get column description
109
+ schema_metadata = entity_data.get("schemaMetadata", {})
110
+ fields = schema_metadata.get("fields", [])
111
+ for field in fields:
112
+ if field.get("fieldPath") == column_path:
113
+ return field.get("description", "")
114
+ return ""
115
+ else:
116
+ # Get entity description
117
+ # Try editableProperties first (for Dataset, Container, etc.)
118
+ editable_props = entity_data.get("editableProperties", {})
119
+ existing_description = editable_props.get("description", "")
120
+
121
+ # If not found, try properties (for Tag, GlossaryTerm, etc.)
122
+ if not existing_description:
123
+ properties = entity_data.get("properties", {})
124
+ existing_description = properties.get("description", "")
125
+
126
+ return existing_description
127
+
128
+ except Exception as e:
129
+ logger.warning(
130
+ f"Failed to fetch existing description for {entity_urn}: {e}. Will treat as empty."
131
+ )
132
+ return ""
133
+
134
+
135
+ def update_description(
136
+ entity_urn: str,
137
+ operation: Literal["replace", "append", "remove"] = "replace",
138
+ description: Optional[str] = None,
139
+ column_path: Optional[str] = None,
140
+ ) -> dict:
141
+ """Update description for a DataHub entity or its column (e.g., schema field).
142
+
143
+ This tool allows you to set, append to, or remove a description for an entity or its column.
144
+ Useful for documenting datasets, containers, charts, dashboards, data flows, data jobs,
145
+ ML models, ML model groups, ML feature tables, ML primary keys, tags, glossary terms,
146
+ glossary nodes, domains, and schema fields.
147
+
148
+ Args:
149
+ entity_urn: Entity URN to update description for (e.g., dataset URN, container URN)
150
+ operation: The operation to perform:
151
+ - "replace": Replace the existing description with the new one (default)
152
+ - "append": Append the new description to the existing one
153
+ - "remove": Remove the description (description parameter not needed)
154
+ description: The description text to set or append (supports markdown formatting).
155
+ Required for "replace" and "append" operations, ignored for "remove".
156
+ column_path: Column_path identifier (e.g., column name for schema field).
157
+ Optional for all entity types (use None for entity-level descriptions).
158
+ For column-level descriptions, provide the column name (e.g., "customer_email").
159
+ Verify that the column_path is correct and valid via the schemaMetadata.
160
+ Use get_entity tool to verify.
161
+
162
+ Returns:
163
+ Dictionary with:
164
+ - success: Boolean indicating if the operation succeeded
165
+ - urn: The entity URN
166
+ - column_path: The column path (if applicable)
167
+ - message: Success or error message
168
+
169
+ Examples:
170
+ # Update description for a container (entity-level)
171
+ update_description(
172
+ entity_urn="urn:li:container:12345",
173
+ operation="replace",
174
+ description="Production data warehouse"
175
+ )
176
+
177
+ # Update description for a dataset
178
+ update_description(
179
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
180
+ operation="replace",
181
+ description="User's table",
182
+ )
183
+
184
+ # Update description for a dataset field (column-level)
185
+ update_description(
186
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
187
+ operation="replace",
188
+ description="User's primary email address",
189
+ column_path="email"
190
+ )
191
+
192
+ # Append to existing field description
193
+ update_description(
194
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
195
+ operation="append",
196
+ description=" (PII)",
197
+ column_path="email"
198
+ )
199
+
200
+ # Remove field description
201
+ update_description(
202
+ entity_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
203
+ operation="remove",
204
+ column_path="old_field"
205
+ )
206
+
207
+ Example:
208
+ from datahub_agent_context.context import DataHubContext
209
+
210
+ with DataHubContext(client.graph):
211
+ result = update_description(
212
+ entity_urn="urn:li:dataset:(...)",
213
+ operation="replace",
214
+ description="User table"
215
+ )
216
+ """
217
+ graph = get_graph()
218
+ # Validate inputs
219
+ if not entity_urn:
220
+ raise ValueError("entity_urn cannot be empty")
221
+
222
+ if operation in ("replace", "append"):
223
+ if not description:
224
+ raise ValueError(f"description is required for '{operation}' operation")
225
+ elif operation == "remove":
226
+ # For remove operation, ignore description parameter
227
+ description = ""
228
+ else:
229
+ raise ValueError(
230
+ f"Invalid operation '{operation}'. Must be 'replace', 'append', or 'remove'"
231
+ )
232
+
233
+ # For append operation, we need to fetch existing description first
234
+ existing_description = ""
235
+ if operation == "append":
236
+ existing_description = _get_existing_description(entity_urn, column_path)
237
+
238
+ # Determine final description based on operation
239
+ if operation == "append":
240
+ final_description = (
241
+ existing_description + description if existing_description else description
242
+ )
243
+ elif operation == "remove":
244
+ final_description = ""
245
+ else: # replace
246
+ final_description = description
247
+
248
+ # GraphQL mutation
249
+ mutation = """
250
+ mutation updateDescription($input: DescriptionUpdateInput!) {
251
+ updateDescription(input: $input)
252
+ }
253
+ """
254
+
255
+ variables: dict = {
256
+ "input": {
257
+ "description": final_description,
258
+ "resourceUrn": entity_urn,
259
+ }
260
+ }
261
+
262
+ # Add subresource fields if provided (for column-level descriptions)
263
+ if column_path:
264
+ variables["input"]["subResource"] = column_path
265
+ variables["input"]["subResourceType"] = "DATASET_FIELD"
266
+
267
+ try:
268
+ result = execute_graphql(
269
+ graph,
270
+ query=mutation,
271
+ variables=variables,
272
+ operation_name="updateDescription",
273
+ )
274
+
275
+ if result.get("updateDescription", False):
276
+ action_verb = "updated" if operation in ("replace", "append") else "removed"
277
+ return {
278
+ "success": True,
279
+ "urn": entity_urn,
280
+ "column_path": column_path,
281
+ "message": f"Description {action_verb} successfully",
282
+ }
283
+ else:
284
+ action = "update" if operation in ("replace", "append") else "remove"
285
+ raise RuntimeError(
286
+ f"Failed to {action} description for {entity_urn}"
287
+ + (f" column {column_path}" if column_path else "")
288
+ + " - operation returned false"
289
+ )
290
+
291
+ except Exception as e:
292
+ if isinstance(e, RuntimeError):
293
+ raise
294
+ action = "update" if operation in ("replace", "append") else "remove"
295
+ raise RuntimeError(
296
+ f"Error {action} description for {entity_urn}"
297
+ + (f" column {column_path}" if column_path else "")
298
+ + f": {str(e)}"
299
+ ) from e