datahub-agent-context 1.4.0rc1__py3-none-any.whl → 1.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,9 +17,17 @@
17
17
  from datahub_agent_context._version import __version__
18
18
  from datahub_agent_context.context import (
19
19
  DataHubContext,
20
+ get_datahub_client,
20
21
  get_graph,
21
- reset_graph,
22
- set_graph,
22
+ reset_client,
23
+ set_client,
23
24
  )
24
25
 
25
- __all__ = ["__version__", "DataHubContext", "get_graph", "set_graph", "reset_graph"]
26
+ __all__ = [
27
+ "__version__",
28
+ "DataHubContext",
29
+ "get_datahub_client",
30
+ "get_graph",
31
+ "set_client",
32
+ "reset_client",
33
+ ]
@@ -13,4 +13,4 @@
13
13
  # limitations under the License.
14
14
 
15
15
  __package_name__ = "datahub-agent-context"
16
- __version__ = "1.4.0rc1"
16
+ __version__ = "1.4.0rc2"
@@ -1,6 +1,6 @@
1
1
  """Context management for DataHub tools.
2
2
 
3
- This module provides a context manager pattern for managing DataHubGraph instances
3
+ This module provides a context manager pattern for managing DataHubClient instances
4
4
  across tool calls without explicit parameter passing.
5
5
  """
6
6
 
@@ -9,56 +9,69 @@ from typing import TYPE_CHECKING, Optional
9
9
 
10
10
  if TYPE_CHECKING:
11
11
  from datahub.ingestion.graph.client import DataHubGraph
12
+ from datahub.sdk.main_client import DataHubClient
12
13
 
13
- # Context variable to store the current DataHubGraph instance
14
- _graph_context: contextvars.ContextVar[Optional["DataHubGraph"]] = (
15
- contextvars.ContextVar("datahub_graph", default=None)
14
+ # Context variable to store the current DataHubClient instance
15
+ _client_context: contextvars.ContextVar[Optional["DataHubClient"]] = (
16
+ contextvars.ContextVar("datahub_client", default=None)
16
17
  )
17
18
 
18
19
 
19
- def get_graph() -> "DataHubGraph":
20
- """Get the current DataHubGraph from context.
20
+ def get_datahub_client() -> "DataHubClient":
21
+ """Get the current DataHubClient from context.
21
22
 
22
23
  Returns:
23
- DataHubGraph instance from context
24
+ DataHubClient instance from context
24
25
 
25
26
  Raises:
26
- RuntimeError: If no graph is set in context
27
+ RuntimeError: If no client is set in context
27
28
  """
28
- graph = _graph_context.get()
29
- if graph is None:
29
+ client = _client_context.get()
30
+ if client is None:
30
31
  raise RuntimeError(
31
- "No DataHubGraph in context. "
32
- "Make sure to use DataHubContext context manager or set_graph() before calling tools."
32
+ "No DataHubClient in context. "
33
+ "Make sure to use DataHubContext context manager or set_client() before calling tools."
33
34
  )
34
- return graph
35
+ return client
36
+
37
+
38
+ def get_graph() -> "DataHubGraph":
39
+ """Get the current DataHubGraph from context (convenience method).
40
+
41
+ Returns:
42
+ DataHubGraph instance from the client in context
43
+
44
+ Raises:
45
+ RuntimeError: If no client is set in context
46
+ """
47
+ return get_datahub_client()._graph
35
48
 
36
49
 
37
- def set_graph(graph: "DataHubGraph") -> contextvars.Token:
38
- """Set the DataHubGraph in context.
50
+ def set_client(client: "DataHubClient") -> contextvars.Token:
51
+ """Set the DataHubClient in context.
39
52
 
40
53
  Args:
41
- graph: DataHubGraph instance to set
54
+ client: DataHubClient instance to set
42
55
 
43
56
  Returns:
44
57
  Token that can be used to reset the context
45
58
  """
46
- return _graph_context.set(graph)
59
+ return _client_context.set(client)
47
60
 
48
61
 
49
- def reset_graph(token: contextvars.Token) -> None:
50
- """Reset the DataHubGraph context to its previous value.
62
+ def reset_client(token: contextvars.Token) -> None:
63
+ """Reset the DataHubClient context to its previous value.
51
64
 
52
65
  Args:
53
- token: Token returned by set_graph()
66
+ token: Token returned by set_client()
54
67
  """
55
- _graph_context.reset(token)
68
+ _client_context.reset(token)
56
69
 
57
70
 
58
71
  class DataHubContext:
59
72
  """Context manager for DataHub tool execution.
60
73
 
61
- This context manager sets the DataHubGraph in context for the duration
74
+ This context manager sets the DataHubClient in context for the duration
62
75
  of the with block, allowing tools to access it without explicit parameter passing.
63
76
 
64
77
  Example:
@@ -68,30 +81,30 @@ class DataHubContext:
68
81
 
69
82
  client = DataHubClient(...)
70
83
 
71
- with DataHubContext(client.graph):
72
- results = search(query="users") # No graph parameter needed!
84
+ with DataHubContext(client):
85
+ results = search(query="users") # No client parameter needed!
73
86
  """
74
87
 
75
- def __init__(self, graph: "DataHubGraph"):
88
+ def __init__(self, client: "DataHubClient"):
76
89
  """Initialize the context manager.
77
90
 
78
91
  Args:
79
- graph: DataHubGraph instance to use in this context
92
+ client: DataHubClient instance to use in this context
80
93
  """
81
- self.graph = graph
94
+ self.client = client
82
95
  self._token: Optional[contextvars.Token] = None
83
96
 
84
- def __enter__(self) -> "DataHubGraph":
85
- """Enter the context and set the graph.
97
+ def __enter__(self) -> "DataHubClient":
98
+ """Enter the context and set the client.
86
99
 
87
100
  Returns:
88
- The DataHubGraph instance
101
+ The DataHubClient instance
89
102
  """
90
- self._token = set_graph(self.graph)
91
- return self.graph
103
+ self._token = set_client(self.client)
104
+ return self.client
92
105
 
93
106
  def __exit__(self, exc_type, exc_val, exc_tb) -> None:
94
- """Exit the context and reset the graph."""
107
+ """Exit the context and reset the client."""
95
108
  if self._token is not None:
96
- reset_graph(self._token)
109
+ reset_client(self._token)
97
110
  self._token = None
@@ -3,7 +3,7 @@
3
3
  import functools
4
4
  from typing import TYPE_CHECKING, Callable
5
5
 
6
- from datahub_agent_context.context import set_graph
6
+ from datahub_agent_context.context import set_client
7
7
  from datahub_agent_context.mcp_tools import get_me
8
8
  from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents
9
9
  from datahub_agent_context.mcp_tools.domains import remove_domains, set_domains
@@ -32,6 +32,7 @@ from datahub_agent_context.mcp_tools.lineage import (
32
32
  )
33
33
  from datahub_agent_context.mcp_tools.owners import add_owners, remove_owners
34
34
  from datahub_agent_context.mcp_tools.queries import get_dataset_queries
35
+ from datahub_agent_context.mcp_tools.save_document import save_document
35
36
  from datahub_agent_context.mcp_tools.search import search
36
37
  from datahub_agent_context.mcp_tools.tags import add_tags, remove_tags
37
38
  from datahub_agent_context.mcp_tools.terms import (
@@ -57,14 +58,14 @@ def create_context_wrapper(func: Callable, client: "DataHubClient") -> Callable:
57
58
  @functools.wraps(func)
58
59
  def wrapper(*args, **kwargs):
59
60
  # Set client in context for this function call
60
- token = set_graph(client._graph)
61
+ token = set_client(client)
61
62
  try:
62
63
  return func(*args, **kwargs)
63
64
  finally:
64
65
  # Always reset context, even if function raises
65
- from datahub_agent_context.context import reset_graph
66
+ from datahub_agent_context.context import reset_client
66
67
 
67
- reset_graph(token)
68
+ reset_client(token)
68
69
 
69
70
  return wrapper
70
71
 
@@ -123,5 +124,6 @@ def build_langchain_tools(
123
124
  tools.append(tool(create_context_wrapper(remove_tags, client)))
124
125
  tools.append(tool(create_context_wrapper(add_glossary_terms, client)))
125
126
  tools.append(tool(create_context_wrapper(remove_glossary_terms, client)))
127
+ tools.append(tool(create_context_wrapper(save_document, client)))
126
128
 
127
129
  return tools
@@ -0,0 +1,634 @@
1
+ """Document saving tool for DataHub MCP server.
2
+
3
+ This tool enables AI agents to save documents to DataHub's knowledge base.
4
+ Documents are organized under a configurable parent folder (default: "Shared"),
5
+ optionally with per-user subfolders for organization.
6
+
7
+ Configuration via environment variables:
8
+ - SAVE_DOCUMENT_TOOL_ENABLED: Set to "false" to disable this tool (default: enabled). The tool also requires TOOLS_IS_MUTATION_ENABLED to be enabled.
9
+ - SAVE_DOCUMENT_PARENT_TITLE: Custom title for the parent folder (default: "Shared")
10
+ - SAVE_DOCUMENT_ORGANIZE_BY_USER: Set to "true" to enable per-user organization (default: false)
11
+ - SAVE_DOCUMENT_RESTRICT_UPDATES: Set to "false" to allow updating any document (default: true - only documents inside the shared folder can be updated)
12
+ """
13
+
14
+ import logging
15
+ import os
16
+ import re
17
+ import uuid
18
+ from datetime import datetime
19
+ from typing import Dict, List, Literal, Optional, Tuple
20
+
21
+ from datahub.metadata import schema_classes as models
22
+ from datahub.sdk import Document
23
+ from datahub_agent_context.context import get_datahub_client
24
+ from datahub_agent_context.mcp_tools.base import execute_graphql
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ # Fixed root parent document ID - independent of title for future flexibility
29
+ ROOT_PARENT_DOC_ID = "__system_shared_documents"
30
+
31
+
32
+ def _get_parent_title() -> str:
33
+ """Get the configurable parent document title from environment."""
34
+ return os.environ.get("SAVE_DOCUMENT_PARENT_TITLE", "Shared")
35
+
36
+
37
+ def _is_organize_by_user_enabled() -> bool:
38
+ """Check if per-user organization is enabled (default: False)."""
39
+ value = os.environ.get("SAVE_DOCUMENT_ORGANIZE_BY_USER", "false")
40
+ return value.lower() in ("true", "1", "yes")
41
+
42
+
43
+ def _restrict_updates_to_shared_folder() -> bool:
44
+ """Check if updates should be restricted to the shared folder (default: True).
45
+
46
+ When enabled, only documents inside the shared folder can be updated.
47
+ This prevents accidental modification of user-created or imported documents.
48
+ """
49
+ value = os.environ.get("SAVE_DOCUMENT_RESTRICT_UPDATES", "true")
50
+ return value.lower() in ("true", "1", "yes")
51
+
52
+
53
+ def _make_safe_id(text: str, max_length: int = 30) -> str:
54
+ """Convert text to a safe ID string."""
55
+ safe_id = "".join(c if c.isalnum() else "-" for c in text.lower())[:max_length]
56
+ safe_id = re.sub(r"-+", "-", safe_id) # Collapse multiple dashes
57
+ return safe_id.strip("-")
58
+
59
+
60
+ def _get_root_parent_id() -> str:
61
+ """Get the root parent document ID.
62
+
63
+ Uses a fixed ID independent of the display title to allow changing
64
+ the title without requiring data migration.
65
+ """
66
+ return ROOT_PARENT_DOC_ID
67
+
68
+
69
+ def _get_root_parent_urn() -> str:
70
+ """Get the root parent document URN."""
71
+ return f"urn:li:document:{_get_root_parent_id()}"
72
+
73
+
74
+ # Supported document types (subtypes)
75
+ DocumentType = Literal[
76
+ "Insight",
77
+ "Decision",
78
+ "FAQ",
79
+ "Analysis",
80
+ "Summary",
81
+ "Recommendation",
82
+ "Note",
83
+ "Context",
84
+ ]
85
+
86
+
87
+ def _get_current_user_info() -> Optional[Dict]:
88
+ """Fetch the current authenticated user's information."""
89
+
90
+ client = get_datahub_client()
91
+
92
+ query = """
93
+ query getMe {
94
+ me {
95
+ corpUser {
96
+ urn
97
+ username
98
+ info {
99
+ displayName
100
+ fullName
101
+ firstName
102
+ lastName
103
+ }
104
+ editableProperties {
105
+ displayName
106
+ }
107
+ }
108
+ }
109
+ }
110
+ """
111
+
112
+ try:
113
+ result = execute_graphql(
114
+ client._graph,
115
+ query=query,
116
+ variables={},
117
+ operation_name="getMe",
118
+ )
119
+ me_data = result.get("me", {})
120
+ return me_data.get("corpUser") if me_data else None
121
+ except Exception as e:
122
+ logger.warning(f"Failed to get current user info: {e}")
123
+ return None
124
+
125
+
126
+ def _get_user_display_name(user_info: Optional[Dict]) -> str:
127
+ """Extract the best display name from user info."""
128
+ if not user_info:
129
+ return "Unknown User"
130
+
131
+ # Try editable displayName first, then info fields
132
+ editable = user_info.get("editableProperties") or {}
133
+ info = user_info.get("info") or {}
134
+
135
+ return (
136
+ editable.get("displayName")
137
+ or info.get("displayName")
138
+ or info.get("fullName")
139
+ or f"{info.get('firstName', '')} {info.get('lastName', '')}".strip()
140
+ or user_info.get("username")
141
+ or "Unknown User"
142
+ )
143
+
144
+
145
+ def _generate_document_id() -> str:
146
+ """Generate a unique document ID using UUID.
147
+
148
+ Each save creates a new document with a unique ID.
149
+ Format: shared-<uuid>
150
+ """
151
+ unique_id = str(uuid.uuid4())
152
+ return f"shared-{unique_id}"
153
+
154
+
155
+ def _is_document_in_shared_folder(document_urn: str) -> Tuple[bool, Optional[str]]:
156
+ """Check if a document is within the shared documents folder.
157
+
158
+ Simple validation: document must have the shared folder as a parent/ancestor.
159
+
160
+ Returns:
161
+ Tuple of (is_valid, error_message)
162
+ - (True, None) if document is in the shared folder
163
+ - (False, error_message) if document is outside the folder
164
+ """
165
+
166
+ client = get_datahub_client()
167
+ root_parent_urn = _get_root_parent_urn()
168
+
169
+ # Can't update the root folder itself
170
+ if document_urn == root_parent_urn:
171
+ return False, (
172
+ "Cannot update the root shared documents folder. "
173
+ "Only documents within this folder can be updated."
174
+ )
175
+
176
+ try:
177
+ # Fetch the document
178
+ doc = client.entities.get(document_urn)
179
+ logger.debug(
180
+ f"Validating document {document_urn} for update, fetched: {doc is not None}"
181
+ )
182
+ if doc is None:
183
+ # Document doesn't exist yet - allow (will be created)
184
+ logger.debug(f"Document {document_urn} does not exist, allowing update")
185
+ return True, None
186
+
187
+ # Get documentInfo aspect
188
+ aspects = getattr(doc, "aspects", None) or getattr(doc, "_aspects", {})
189
+ logger.debug(
190
+ f"Document aspects type: {type(aspects)}, keys: {list(aspects.keys()) if isinstance(aspects, dict) else 'N/A'}"
191
+ )
192
+ doc_info = aspects.get("documentInfo") if isinstance(aspects, dict) else None
193
+
194
+ if doc_info is None:
195
+ logger.debug(f"Document {document_urn} has no documentInfo aspect")
196
+ return False, (
197
+ f"Document '{document_urn}' has no document info. "
198
+ "Cannot verify it's in the shared folder."
199
+ )
200
+
201
+ # Walk up the parent chain looking for the shared folder
202
+ # parentDocument is a ParentDocumentClass with a .document field containing the URN
203
+ parent_doc_obj = getattr(doc_info, "parentDocument", None)
204
+ logger.debug(f"Document parentDocument object: {parent_doc_obj}")
205
+ current_parent_urn = (
206
+ getattr(parent_doc_obj, "document", None) if parent_doc_obj else None
207
+ )
208
+ logger.debug(
209
+ f"Document parent URN: {current_parent_urn}, looking for root: {root_parent_urn}"
210
+ )
211
+ visited = set()
212
+
213
+ while current_parent_urn:
214
+ if current_parent_urn in visited:
215
+ break
216
+ visited.add(current_parent_urn)
217
+
218
+ # Found the shared folder - document is valid
219
+ if current_parent_urn == root_parent_urn:
220
+ return True, None
221
+
222
+ # Fetch parent and continue walking up
223
+ try:
224
+ parent_doc = client.entities.get(current_parent_urn)
225
+ if parent_doc is None:
226
+ break
227
+ parent_aspects = getattr(parent_doc, "aspects", None) or getattr(
228
+ parent_doc, "_aspects", {}
229
+ )
230
+ parent_info = (
231
+ parent_aspects.get("documentInfo")
232
+ if isinstance(parent_aspects, dict)
233
+ else None
234
+ )
235
+ if parent_info is None:
236
+ break
237
+ # Get next parent - again, it's a ParentDocumentClass object
238
+ next_parent_obj = getattr(parent_info, "parentDocument", None)
239
+ current_parent_urn = (
240
+ getattr(next_parent_obj, "document", None)
241
+ if next_parent_obj
242
+ else None
243
+ )
244
+ except Exception:
245
+ break
246
+
247
+ return False, (
248
+ f"Document '{document_urn}' is not in the shared documents folder. "
249
+ "Only documents in this folder can be updated."
250
+ )
251
+
252
+ except Exception as e:
253
+ logger.error(f"Failed to validate document hierarchy: {e}", exc_info=True)
254
+ # Fail closed - if we can't validate, don't allow the update
255
+ return False, (
256
+ f"Failed to validate document hierarchy for '{document_urn}': {str(e)}. "
257
+ "Cannot update document without verifying it's in the shared folder."
258
+ )
259
+
260
+
261
+ def _ensure_document_exists(
262
+ doc_id: str,
263
+ title: str,
264
+ description: str,
265
+ parent_urn: Optional[str] = None,
266
+ ) -> str:
267
+ """Ensure a document exists, creating it if necessary. Returns the URN."""
268
+
269
+ client = get_datahub_client()
270
+ doc_urn = f"urn:li:document:{doc_id}"
271
+
272
+ try:
273
+ existing = client.entities.get(doc_urn)
274
+ if existing is not None:
275
+ return doc_urn
276
+ except Exception:
277
+ pass
278
+
279
+ # Create the document
280
+ doc = Document.create_document(
281
+ id=doc_id,
282
+ title=title,
283
+ text=description,
284
+ subtype="Folder",
285
+ parent_document=parent_urn,
286
+ show_in_global_context=True,
287
+ )
288
+
289
+ try:
290
+ client.entities.upsert(doc)
291
+ logger.info(f"Created folder document: {doc_urn}")
292
+ except Exception as e:
293
+ logger.warning(f"Failed to create folder document (may already exist): {e}")
294
+
295
+ return doc_urn
296
+
297
+
298
+ def _ensure_parent_hierarchy(user_info: Optional[Dict]) -> Tuple[str, Optional[str]]:
299
+ """Ensure the parent document hierarchy exists.
300
+
301
+ Returns:
302
+ Tuple of (parent_urn_for_document, user_urn_if_available)
303
+ """
304
+ root_title = _get_parent_title()
305
+ root_id = _get_root_parent_id()
306
+
307
+ # Always create the root parent
308
+ root_urn = _ensure_document_exists(
309
+ doc_id=root_id,
310
+ title=root_title,
311
+ description="Contains shared documents authored through AI agents like Ask DataHub.",
312
+ parent_urn=None,
313
+ )
314
+
315
+ # If per-user organization is disabled, return root as parent
316
+ if not _is_organize_by_user_enabled():
317
+ return root_urn, user_info.get("urn") if user_info else None
318
+
319
+ # Create user-specific folder if we have user info
320
+ if user_info:
321
+ user_urn = user_info.get("urn")
322
+ username = user_info.get("username", "unknown")
323
+ display_name = _get_user_display_name(user_info)
324
+
325
+ # Create user folder under root
326
+ user_folder_id = f"agent-docs-user-{_make_safe_id(username, max_length=30)}"
327
+ user_folder_urn = _ensure_document_exists(
328
+ doc_id=user_folder_id,
329
+ title=display_name,
330
+ description=f"Contains documents authored in sessions for {display_name}.",
331
+ parent_urn=root_urn,
332
+ )
333
+ return user_folder_urn, user_urn
334
+
335
+ # No user info available, use root
336
+ return root_urn, None
337
+
338
+
339
+ def save_document(
340
+ document_type: DocumentType,
341
+ title: str,
342
+ content: str,
343
+ urn: Optional[str] = None,
344
+ topics: Optional[List[str]] = None,
345
+ related_documents: Optional[List[str]] = None,
346
+ related_assets: Optional[List[str]] = None,
347
+ ) -> dict:
348
+ """Save or update a STANDALONE document in DataHub's knowledge base. Once saved,
349
+ a document will be visible to all users of DataHub and to Ask DataHub AI assistant.
350
+
351
+ NOTE: This tool is for creating standalone documents (insights, FAQs, notes, etc.),
352
+ NOT for updating descriptions on data assets like datasets or dashboards.
353
+ Use update_description for asset descriptions.
354
+
355
+ WHEN TO USE THIS TOOL:
356
+
357
+ Use this tool when the user explicitly requests to save information:
358
+ - "Save this for later..."
359
+ - "Bookmark this..."
360
+ - "Document this insight..."
361
+ - "Remember this..."
362
+ - "Add this to our knowledge base..."
363
+ - "Create a document about this..."
364
+
365
+ Also SUGGEST using this tool when the user provides valuable information such as:
366
+ - Useful SQL queries they want to reuse
367
+ - Decisions about data modeling or architecture
368
+ - FAQs or common questions about data
369
+ - Analysis results worth sharing with the team
370
+ - Corrections or clarifications about data, service, business definitions, etc.
371
+
372
+ ⚠️ IMPORTANT: Before calling this tool, you SHOULD confirm with the user that
373
+ they want to save this document. Present the title, content summary,
374
+ and any related assets, and ask for their approval before proceeding. Do not attempt to save
375
+ information that would be private or user-specific.
376
+
377
+ This tool persists insights, decisions, FAQs, and other contextual information
378
+ as documents in DataHub. Documents are organized hierarchically:
379
+ - Under a configurable parent folder (default: "Shared" for global context)
380
+ - Optionally grouped by the user who authored them
381
+
382
+ UPSERT BEHAVIOR:
383
+ - If `urn` is NOT provided: Creates a NEW document with a unique URN
384
+ - If `urn` IS provided: Updates the EXISTING document with that URN
385
+
386
+ IMPORTANT USAGE GUIDELINES:
387
+ - Always confirm with the user before saving
388
+ - Provide a clear summary of what will be saved
389
+ - Ask if the user wants to proceed with creating/updating the document
390
+
391
+ REQUIRED PARAMETERS:
392
+
393
+ document_type: The type of document being saved. For example:
394
+ - "Insight": Data insights or discoveries
395
+ - "Decision": Documented decisions with rationale
396
+ - "FAQ": Frequently asked questions and answers
397
+ - "Analysis": Data analysis findings
398
+ - "Summary": Summaries of complex information
399
+ - "Recommendation": Suggested actions or improvements
400
+ - "Note": General notes or observations
401
+
402
+ title: A descriptive title for the document.
403
+ - Example: "Sales Data Quality Issues - Q4 2024"
404
+ - Example: "Decision: Deprecating Legacy Customer Table"
405
+
406
+ content: The full content of the document (supports markdown formatting).
407
+ - Can include headers, lists, code blocks, tables, etc.
408
+ - Example: "## Summary\\n\\nThe orders table shows 15% null values..."
409
+
410
+ OPTIONAL PARAMETERS:
411
+
412
+ urn: The URN of an existing document to update.
413
+ - ONLY use after a search_documents or get_entities call returns a document URN
414
+ - Example: "urn:li:document:agent-insight-abc123"
415
+ - If not provided, a new document is created with a unique URN
416
+ - If provided, the existing document is updated (upsert operation)
417
+
418
+ topics: List of topic tags for categorization and discovery (like a word cloud).
419
+ - These become searchable tags in DataHub that users can click to find related documents
420
+ - Example: ["data-quality", "customer-data", "Q4-2024"]
421
+ - Example: ["high-priority", "sales", "email", "null-values"]
422
+
423
+ related_documents: URNs of related documents.
424
+ - Example: ["urn:li:document:agent-insight-sales-abc123"]
425
+ - Creates links between related knowledge
426
+
427
+ related_assets: URNs of related data assets (tables, dashboards, etc).
428
+ - Example: ["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.orders,PROD)"]
429
+ - Links the document to specific data assets in the catalog
430
+ - Users can then see this document when viewing those assets
431
+
432
+ Returns:
433
+ Dictionary with:
434
+ - success: Boolean indicating if the operation succeeded
435
+ - urn: The URN of the created/updated document
436
+ - message: Success or error message
437
+ - author: The user who authored the document (if available)
438
+
439
+ RECOMMENDED WORKFLOW:
440
+
441
+ 1. Gather information you want to save publicly
442
+ 2. Present a summary to the user:
443
+ "I'd like to save the following insight to DataHub:
444
+ - Title: High Null Rate in Customer Emails
445
+ - Type: Insight
446
+ - Related to: customers table
447
+ Would you like me to save this?"
448
+ 3. Only call save_document after user confirms
449
+
450
+ EXAMPLE USAGE:
451
+
452
+ 1. Create a new insight (after user confirmation):
453
+ save_document(
454
+ document_type="Insight",
455
+ title="High Null Rate in Customer Emails",
456
+ content="## Finding\\n\\n23% of customer records have null email...",
457
+ topics=["data-quality", "customer-data", "email", "high-severity"],
458
+ related_assets=["urn:li:dataset:(urn:li:dataPlatform:snowflake,customers,PROD)"]
459
+ )
460
+
461
+ 2. Update an existing document (after finding it via search_documents):
462
+ save_document(
463
+ urn="urn:li:document:agent-insight-abc123", # From search_documents result
464
+ document_type="Insight",
465
+ title="High Null Rate in Customer Emails (Updated)",
466
+ content="## Finding\\n\\nUpdated: Now 18% of customer records have null email...",
467
+ topics=["data-quality", "customer-data", "email", "resolved"]
468
+ )
469
+
470
+ 3. Document a decision:
471
+ save_document(
472
+ document_type="Decision",
473
+ title="Migrating to New Production Database",
474
+ content="## Decision\\n\\nWe will migrate to v2 schema...\\n\\n## Rationale\\n...",
475
+ topics=["architecture", "data-model", "migration", "approved"]
476
+ )
477
+ """
478
+
479
+ client = get_datahub_client()
480
+
481
+ # Validate inputs
482
+ if not title or not title.strip():
483
+ return {
484
+ "success": False,
485
+ "urn": None,
486
+ "message": "title cannot be empty",
487
+ "author": None,
488
+ }
489
+
490
+ if not content or not content.strip():
491
+ return {
492
+ "success": False,
493
+ "urn": None,
494
+ "message": "content cannot be empty",
495
+ "author": None,
496
+ }
497
+
498
+ valid_document_types = [
499
+ "Insight",
500
+ "Decision",
501
+ "FAQ",
502
+ "Analysis",
503
+ "Summary",
504
+ "Recommendation",
505
+ "Note",
506
+ "Context",
507
+ ]
508
+ if document_type not in valid_document_types:
509
+ return {
510
+ "success": False,
511
+ "urn": None,
512
+ "message": f"Invalid document_type '{document_type}'. Must be one of: {', '.join(valid_document_types)}",
513
+ "author": None,
514
+ }
515
+
516
+ # Validate URN format if provided
517
+ if urn is not None:
518
+ if not urn.startswith("urn:li:document:"):
519
+ return {
520
+ "success": False,
521
+ "urn": None,
522
+ "message": f"Invalid urn format '{urn}'. Must start with 'urn:li:document:'",
523
+ "author": None,
524
+ }
525
+
526
+ # Validate that the document is within the agent-authored hierarchy
527
+ # This prevents accidental modification of user-created or imported documents
528
+ if _restrict_updates_to_shared_folder():
529
+ is_valid, error_message = _is_document_in_shared_folder(urn)
530
+ if not is_valid:
531
+ return {
532
+ "success": False,
533
+ "urn": None,
534
+ "message": error_message,
535
+ "author": None,
536
+ }
537
+
538
+ is_update = True
539
+ document_urn = urn
540
+ # Extract document ID from URN
541
+ document_id = urn.replace("urn:li:document:", "")
542
+ else:
543
+ is_update = False
544
+ # Generate new document ID
545
+ document_id = _generate_document_id()
546
+ document_urn = f"urn:li:document:{document_id}"
547
+
548
+ try:
549
+ # Get current user info for attribution and organization
550
+ user_info = _get_current_user_info()
551
+ user_display_name = _get_user_display_name(user_info) if user_info else None
552
+ user_urn = user_info.get("urn") if user_info else None
553
+
554
+ # Ensure parent hierarchy exists and get the parent URN for our document
555
+ # For updates, we still want to maintain proper parent hierarchy
556
+ parent_urn, _ = _ensure_parent_hierarchy(user_info)
557
+
558
+ # Set current user as owner only for NEW documents (captures authorship)
559
+ # For updates, preserve existing ownership by not setting owners
560
+ # The SDK expects owner URNs as strings in a list
561
+ if is_update:
562
+ owners = None # Don't overwrite existing ownership on updates
563
+ logger.info("Updating existing document - preserving existing ownership")
564
+ else:
565
+ owners = [user_urn] if user_urn else None
566
+ logger.info(f"Creating new document - setting owners: {owners}")
567
+
568
+ # Convert topics to tag URNs (DataHub expects full URNs)
569
+ # TODO: Decide whether tags are the right abstraction here.
570
+ # Alternative: Use Structured Properties, custom properties, or custom label.
571
+ # Just needs to be searchable later on.
572
+ tag_urns = None
573
+ if topics:
574
+ tag_urns = [f"urn:li:tag:{topic}" for topic in topics]
575
+
576
+ # Create the document
577
+ doc = Document.create_document(
578
+ id=document_id,
579
+ title=title,
580
+ text=content,
581
+ subtype=document_type,
582
+ parent_document=parent_urn,
583
+ related_documents=related_documents,
584
+ related_assets=related_assets,
585
+ owners=owners,
586
+ tags=tag_urns, # Use topics as tags for searchability
587
+ show_in_global_context=True,
588
+ )
589
+
590
+ # Manually add DocumentSettings to ensure showInGlobalContext=True is set
591
+ # This is a workaround until the SDK is updated to always emit this aspect
592
+ # TODO: Remove this once SDK properly emits DocumentSettings for show_in_global_context=True
593
+ actor_urn = user_urn or "urn:li:corpuser:datahub"
594
+ settings_audit = models.AuditStampClass(
595
+ time=int(datetime.now().timestamp() * 1000),
596
+ actor=actor_urn,
597
+ )
598
+ document_settings = models.DocumentSettingsClass(
599
+ showInGlobalContext=True,
600
+ lastModified=settings_audit,
601
+ )
602
+ doc._set_aspect(document_settings)
603
+
604
+ # Log document details before upsert
605
+ logger.info(
606
+ f"Document to upsert: URN={document_urn}, owners={owners}, parent={parent_urn}"
607
+ )
608
+
609
+ # Upsert the document
610
+ try:
611
+ client.entities.upsert(doc)
612
+ logger.info("Upsert completed successfully")
613
+ except Exception as upsert_error:
614
+ logger.error(f"Failed to upsert document: {upsert_error}", exc_info=True)
615
+ raise
616
+
617
+ action = "updated" if is_update else "created"
618
+ logger.info(f"Successfully {action} document: {document_urn}")
619
+
620
+ return {
621
+ "success": True,
622
+ "urn": document_urn,
623
+ "message": f"Successfully {action} document: {title}",
624
+ "author": user_display_name,
625
+ }
626
+
627
+ except Exception as e:
628
+ logger.error(f"Failed to save document: {e}")
629
+ return {
630
+ "success": False,
631
+ "urn": None,
632
+ "message": f"Error saving document: {str(e)}",
633
+ "author": None,
634
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datahub-agent-context
3
- Version: 1.4.0rc1
3
+ Version: 1.4.0rc2
4
4
  Summary: DataHub Agent Context - MCP Tools for AI Agents
5
5
  Home-page: https://datahub.io/
6
6
  License: Apache License 2.0
@@ -28,28 +28,28 @@ Classifier: Environment :: MacOS X
28
28
  Classifier: Topic :: Software Development
29
29
  Requires-Python: >=3.9
30
30
  Description-Content-Type: text/markdown
31
- Requires-Dist: cachetools<7.0.0,>=5.0.0
32
- Requires-Dist: httpcore<2.0,>=1.0.9
33
- Requires-Dist: json-repair<1.0.0,>=0.25.0
34
31
  Requires-Dist: h11<1.0,>=0.16
35
- Requires-Dist: jmespath<2.0.0,>=1.0.0
36
- Requires-Dist: acryl-datahub==1.4.0rc1
37
- Requires-Dist: pydantic<3.0.0,>=2.0.0
38
32
  Requires-Dist: google-re2<2.0,>=1.0
33
+ Requires-Dist: acryl-datahub==1.4.0rc2
34
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
35
+ Requires-Dist: httpcore<2.0,>=1.0.9
36
+ Requires-Dist: jmespath<2.0.0,>=1.0.0
37
+ Requires-Dist: json-repair<1.0.0,>=0.25.0
38
+ Requires-Dist: cachetools<7.0.0,>=5.0.0
39
39
  Provides-Extra: dev
40
- Requires-Dist: tox<5.0.0,>=4.0.0; extra == "dev"
41
- Requires-Dist: types-PyYAML<7.0.0,>=6.0.0; extra == "dev"
42
- Requires-Dist: mypy==1.17.1; extra == "dev"
43
- Requires-Dist: types-jmespath<2.0.0,>=1.0.0; extra == "dev"
44
- Requires-Dist: snowflake-connector-python<4.0.0,>=3.0.0; extra == "dev"
45
- Requires-Dist: click<9.0.0,>=8.0.0; extra == "dev"
46
- Requires-Dist: types-toml<1.0.0,>=0.10.0; extra == "dev"
47
- Requires-Dist: pytest-cov<7.0.0,>=2.8.0; extra == "dev"
48
40
  Requires-Dist: ruff==0.11.7; extra == "dev"
49
- Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
50
- Requires-Dist: types-requests<3.0.0,>=2.0.0; extra == "dev"
51
41
  Requires-Dist: langchain-core<2.0.0,>=1.2.7; extra == "dev"
42
+ Requires-Dist: snowflake-connector-python<4.0.0,>=3.0.0; extra == "dev"
43
+ Requires-Dist: pytest-cov<7.0.0,>=2.8.0; extra == "dev"
52
44
  Requires-Dist: types-cachetools<7.0.0,>=5.0.0; extra == "dev"
45
+ Requires-Dist: types-toml<1.0.0,>=0.10.0; extra == "dev"
46
+ Requires-Dist: tox<5.0.0,>=4.0.0; extra == "dev"
47
+ Requires-Dist: click<9.0.0,>=8.0.0; extra == "dev"
48
+ Requires-Dist: types-requests<3.0.0,>=2.0.0; extra == "dev"
49
+ Requires-Dist: mypy==1.17.1; extra == "dev"
50
+ Requires-Dist: types-PyYAML<7.0.0,>=6.0.0; extra == "dev"
51
+ Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
52
+ Requires-Dist: types-jmespath<2.0.0,>=1.0.0; extra == "dev"
53
53
  Provides-Extra: langchain
54
54
  Requires-Dist: langchain-core<2.0.0,>=1.2.7; extra == "langchain"
55
55
  Provides-Extra: snowflake
@@ -111,7 +111,7 @@ from datahub_agent_context.mcp_tools.entities import get_entities
111
111
  client = DataHubClient.from_env()
112
112
 
113
113
  # Search for datasets
114
- with client.graph as graph:
114
+ with client as client:
115
115
  results = search(
116
116
  query="user_data",
117
117
  filters={"entity_type": ["dataset"]},
@@ -119,7 +119,7 @@ with client.graph as graph:
119
119
  )
120
120
 
121
121
  # Get detailed entity information
122
- with client.graph as graph:
122
+ with client as client:
123
123
  entities = get_entities(
124
124
  urns=[result["entity"]["urn"] for result in results["searchResults"]]
125
125
  )
@@ -181,6 +181,7 @@ agent = create_agent(model, tools=tools, system_prompt="...")
181
181
  - `add_owners()`, `remove_owners()` - Manage owners
182
182
  - `add_glossary_terms()`, `remove_glossary_terms()` - Manage glossary terms
183
183
  - `add_structured_properties()`, `remove_structured_properties()` - Manage structured properties
184
+ - `save_document()` - Save or update a Document.
184
185
 
185
186
  #### User Tools
186
187
 
@@ -1,10 +1,10 @@
1
- datahub_agent_context/__init__.py,sha256=VGBOuNztxuwUi5Ofnrpe7tw8EmUrQD-i-eMbSKwvMtU,890
2
- datahub_agent_context/_version.py,sha256=oeJ65E7WFebrM-WKg-BHPZ37x3hoyefmc19QzVAGYYs,648
1
+ datahub_agent_context/__init__.py,sha256=WgJFMZaA5ae_9ntP686UXd0TvZpbGwQRdSISi0JHsvU,967
2
+ datahub_agent_context/_version.py,sha256=sjXQOjG_dBO3UkwODDbFi7aLjoKMlkbubWQwtnl0qh0,648
3
3
  datahub_agent_context/cli.py,sha256=ND0KLT3cFb6KnQl6kEb7B74tAOu6yfS4dO6mJjZW1x4,4441
4
- datahub_agent_context/context.py,sha256=qRc44o38Y-LoDQH1oFm38hIatOWRKnRxxytQNIR93kU,2771
4
+ datahub_agent_context/context.py,sha256=wj9q9hGf72q6oarnfEFHzqgS-vwtMO79hhjz8GNC0QQ,3163
5
5
  datahub_agent_context/py.typed,sha256=kO13kg6OXApIRwKRcPpEOL09GZHx2Pk8Rp2KZpxv0lw,63
6
6
  datahub_agent_context/langchain_tools/__init__.py,sha256=M0tn6fD9qY5Wc1XdptQuIf_7MSKLX8OSBaBxcPo5wmw,259
7
- datahub_agent_context/langchain_tools/builder.py,sha256=X59zdmdUqltKiTo3HZrE4-JOd7CztGprW2O32jIYt2o,5145
7
+ datahub_agent_context/langchain_tools/builder.py,sha256=-h8IuFWfIJGhQXnuK5ASCMwhoNmIhxLV2yu3ZOZPwpg,5288
8
8
  datahub_agent_context/mcp_tools/__init__.py,sha256=7iUoWuT-KvszOqnmL3_co2LVQdhZtkQKRLRE98Hn8WM,1544
9
9
  datahub_agent_context/mcp_tools/_token_estimator.py,sha256=U0kTqPZKBkKwxe7JZaLxIIFEobNSrEEHoM4NQbrmmAE,2782
10
10
  datahub_agent_context/mcp_tools/base.py,sha256=UFqMe9yS-YikgaKOnu2DkaMLaHRzwSUBOFXOWxBjULA,11054
@@ -17,6 +17,7 @@ datahub_agent_context/mcp_tools/helpers.py,sha256=NRIoVEB62vDWDg26UOFv-IhM8mEQd4
17
17
  datahub_agent_context/mcp_tools/lineage.py,sha256=sJVR2jJkbGU_KjjtqZ8IJVOKDaIjDdtQKtAIxYWq71Q,26753
18
18
  datahub_agent_context/mcp_tools/owners.py,sha256=LGZ5n5a3xRKSttay2NLf_rq97_Dl9pGIcVFi-l7uJK8,11798
19
19
  datahub_agent_context/mcp_tools/queries.py,sha256=V4-yFcCi3c8r4Xy7XVKfQ7s3SsIWXMAHRrI8Sqf2g20,6864
20
+ datahub_agent_context/mcp_tools/save_document.py,sha256=r3nYUQSQjmgCybX7X39zu8Ef4FON284K_D1X9Y53cIE,23221
20
21
  datahub_agent_context/mcp_tools/search.py,sha256=z5Hy1jLV4uDO26nb_oFuP5w6GX0DYcYWRIWn3kDp7dY,9880
21
22
  datahub_agent_context/mcp_tools/structured_properties.py,sha256=amj7C-sbeAyctrXY_rpc2vCNTaJy2aTRx21TioeKEJk,15745
22
23
  datahub_agent_context/mcp_tools/tags.py,sha256=5_Wg1Jqf_FgPgYuUV5bDwQ6J8t_sECcSM5yVtwQruPs,10814
@@ -59,7 +60,7 @@ datahub_agent_context/snowflake/udfs/search_datahub.py,sha256=-El0JnpkClaaxX5tCZ
59
60
  datahub_agent_context/snowflake/udfs/search_documents.py,sha256=yrcLsSNyq_M-CsgwGrsgSXFVeusKAvuNqEI_DvHtC08,1944
60
61
  datahub_agent_context/snowflake/udfs/set_domains.py,sha256=nKFJ9ZMBCz9qWgJv-FkOkeEmRE-QYBUCKbHw1RFdJmc,1701
61
62
  datahub_agent_context/snowflake/udfs/update_description.py,sha256=laJjqqRARDy3VCo0xX_5lvTrZn2U3Dwm7XEliPdDRSA,2096
62
- datahub_agent_context-1.4.0rc1.dist-info/METADATA,sha256=ikdYmmpFuJB8wQ7Ybs4J-9VnMumP9xEV-qR6V4ScgT8,7756
63
- datahub_agent_context-1.4.0rc1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
64
- datahub_agent_context-1.4.0rc1.dist-info/top_level.txt,sha256=Tv1bg7ZwDOKM9u9RHj5m1Zbx2LDf4lVBBRNHi_gBBTI,22
65
- datahub_agent_context-1.4.0rc1.dist-info/RECORD,,
63
+ datahub_agent_context-1.4.0rc2.dist-info/METADATA,sha256=GzhPDkJ4HtHmsyCeu_Lm_z44amlbDRHevW7R9hSH0Y8,7795
64
+ datahub_agent_context-1.4.0rc2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
65
+ datahub_agent_context-1.4.0rc2.dist-info/top_level.txt,sha256=Tv1bg7ZwDOKM9u9RHj5m1Zbx2LDf4lVBBRNHi_gBBTI,22
66
+ datahub_agent_context-1.4.0rc2.dist-info/RECORD,,