datahub-agent-context 1.3.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
datahub_agent_context/mcp_tools/documents.py
@@ -0,0 +1,473 @@
+ """Document tools for DataHub MCP server."""
+
+ import pathlib
+ from typing import Any, Dict, List, Literal, Optional
+
+ import re2  # type: ignore[import-untyped]
+
+ from datahub_agent_context.context import get_graph
+ from datahub_agent_context.mcp_tools.base import (
+     clean_gql_response,
+     execute_graphql,
+     fetch_global_default_view,
+ )
+
+ # Load GraphQL queries at module level
+ document_search_gql = (
+     pathlib.Path(__file__).parent / "gql/document_search.gql"
+ ).read_text()
+ document_semantic_search_gql = (
+     pathlib.Path(__file__).parent / "gql/document_semantic_search.gql"
+ ).read_text()
+ read_documents_gql = (
+     pathlib.Path(__file__).parent / "gql/read_documents.gql"
+ ).read_text()
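+ # Note: these queries are read once, at import time; the .gql files ship in
+ # the wheel alongside this module (see the file listing above).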
+
+
+ def search_documents(
+     query: str = "*",
+     platforms: Optional[List[str]] = None,
+     domains: Optional[List[str]] = None,
+     tags: Optional[List[str]] = None,
+     glossary_terms: Optional[List[str]] = None,
+     owners: Optional[List[str]] = None,
+     num_results: int = 10,
+     offset: int = 0,
+ ) -> dict:
+     """Search for documents stored in the customer's DataHub deployment.
+
+     These are the organization's own documents (runbooks, FAQs, knowledge articles)
+     ingested from sources like Notion, Confluence, etc.
+
+     Returns document metadata WITHOUT content to keep responses concise.
+     Use get_entities() with a document URN to retrieve full content when needed.
+
+     KEYWORD SEARCH:
+     - Full-text search with boolean logic
+     - Use /q prefix for structured queries
+     - Examples:
+       • /q deployment guide → documents containing both terms
+       • /q kubernetes OR k8s → documents with either term
+       • /q "production deployment" → exact phrase match
+
+     FILTERS - Narrow results by metadata:
+
+     platforms: Filter by source platform (use full URN)
+     - Examples: ["urn:li:dataPlatform:notion"], ["urn:li:dataPlatform:confluence"]
+
+     domains: Filter by business domain (use full URN)
+     - Examples: ["urn:li:domain:engineering"], ["urn:li:domain:data-platform"]
+
+     tags: Filter by tags (use full URN)
+     - Examples: ["urn:li:tag:critical"], ["urn:li:tag:deprecated"]
+
+     glossary_terms: Filter by glossary terms (use full URN)
+     - Examples: ["urn:li:glossaryTerm:pii"], ["urn:li:glossaryTerm:gdpr"]
+
+     owners: Filter by document owners (use full URN)
+     - Examples: ["urn:li:corpuser:alice"], ["urn:li:corpGroup:platform-team"]
+
+     PAGINATION:
+     - num_results: Number of results per page (max: 50)
+     - offset: Starting position (default: 0)
+
+     FACET DISCOVERY:
+     - Set num_results=0 to get ONLY facets (no results)
+     - Useful for discovering what platforms, domains exist
+
+     EXAMPLE WORKFLOWS:
+
+     1. Find Notion docs about deployment:
+        search_documents(query="deployment", platforms=["urn:li:dataPlatform:notion"])
+
+     2. Discover document sources:
+        search_documents(num_results=0)
+        → Examine facets to see available platforms, domains
+
+     3. Find engineering team's critical docs:
+        search_documents(
+            domains=["urn:li:domain:engineering"],
+            tags=["urn:li:tag:critical"]
+        )
+
+     Args:
+         query: Search query string
+         platforms: List of platform URNs to filter by
+         domains: List of domain URNs to filter by
+         tags: List of tag URNs to filter by
+         glossary_terms: List of glossary term URNs to filter by
+         owners: List of owner URNs to filter by
+         num_results: Number of results to return (max 50)
+         offset: Starting position for pagination
+
+     Returns:
+         Dictionary with search results and facets
+
+     Example:
+         from datahub_agent_context.context import DataHubContext
+
+         with DataHubContext(client.graph):
+             result = search_documents(query="deployment")
+     """
+     graph = get_graph()
+     return _search_documents_impl(
+         graph,
+         query=query,
+         search_strategy="keyword",
+         platforms=platforms,
+         domains=domains,
+         tags=tags,
+         glossary_terms=glossary_terms,
+         owners=owners,
+         num_results=num_results,
+         offset=offset,
+     )
+
+
+ def _search_documents_impl(
+     graph,
+     query: str = "*",
+     search_strategy: Optional[Literal["semantic", "keyword"]] = None,
+     sub_types: Optional[List[str]] = None,
+     platforms: Optional[List[str]] = None,
+     domains: Optional[List[str]] = None,
+     tags: Optional[List[str]] = None,
+     glossary_terms: Optional[List[str]] = None,
+     owners: Optional[List[str]] = None,
+     num_results: int = 10,
+     offset: int = 0,
+ ) -> dict:
+     """Search for documents stored in the customer's DataHub deployment.
+
+     These are the organization's own documents (runbooks, FAQs, knowledge articles)
+     ingested from sources like Notion, Confluence, etc. - not DataHub documentation.
+
+     Returns document metadata WITHOUT content to keep responses concise.
+     Use get_entities() with a document URN to retrieve full content when needed.
+
+     SEARCH STRATEGIES:
+
+     SEMANTIC SEARCH (search_strategy="semantic"):
+     - Uses AI embeddings to find conceptually related documents
+     - Best for: natural language queries, finding related topics
+     - Example: "how to deploy" finds deployment guides, CI/CD docs, release runbooks
+
+     KEYWORD SEARCH (search_strategy="keyword" or default):
+     - Full-text search with boolean logic
+     - Use /q prefix for structured queries
+     - Examples:
+       • /q deployment guide → documents containing both terms
+       • /q kubernetes OR k8s → documents with either term
+       • /q "production deployment" → exact phrase match
+
+     FILTERS - Narrow results by metadata:
+
+     sub_types: Filter by document type
+     - Examples: ["Runbook"], ["FAQ", "Tutorial"], ["Reference"]
+
+     platforms: Filter by source platform (use full URN)
+     - Examples: ["urn:li:dataPlatform:notion"], ["urn:li:dataPlatform:confluence"]
+
+     domains: Filter by business domain (use full URN)
+     - Examples: ["urn:li:domain:engineering"], ["urn:li:domain:data-platform"]
+
+     tags: Filter by tags (use full URN)
+     - Examples: ["urn:li:tag:critical"], ["urn:li:tag:deprecated"]
+
+     glossary_terms: Filter by glossary terms (use full URN)
+     - Examples: ["urn:li:glossaryTerm:pii"], ["urn:li:glossaryTerm:gdpr"]
+
+     owners: Filter by document owners (use full URN)
+     - Examples: ["urn:li:corpuser:alice"], ["urn:li:corpGroup:platform-team"]
+
+     PAGINATION:
+     - num_results: Number of results per page (max: 50)
+     - offset: Starting position (default: 0)
+
+     FACET DISCOVERY:
+     - Set num_results=0 to get ONLY facets (no results)
+     - Useful for discovering what sub_types, platforms, domains exist
+
+     EXAMPLE WORKFLOWS:
+
+     1. Find all runbooks:
+        search_documents(sub_types=["Runbook"])
+
+     2. Find Notion docs about deployment:
+        search_documents(query="deployment", platforms=["urn:li:dataPlatform:notion"])
+
+     3. Discover document types:
+        search_documents(num_results=0)
+        → Examine facets to see available subTypes, platforms, domains
+
+     4. Find engineering team's critical docs:
+        search_documents(
+            domains=["urn:li:domain:engineering"],
+            tags=["urn:li:tag:critical"]
+        )
+     """
+     # Cap num_results at 50
+     num_results = min(num_results, 50)
+
+     # Build orFilters from the simple filter parameters
+     # Each filter type is ANDed together, values within a filter are ORed
+     and_filters: List[Dict[str, Any]] = []
+
+     if sub_types:
+         and_filters.append({"field": "subTypes", "values": sub_types})
+     if platforms:
+         and_filters.append({"field": "platform", "values": platforms})
+     if domains:
+         and_filters.append({"field": "domains", "values": domains})
+     if tags:
+         and_filters.append({"field": "tags", "values": tags})
+     if glossary_terms:
+         and_filters.append({"field": "glossaryTerms", "values": glossary_terms})
+     if owners:
+         and_filters.append({"field": "owners", "values": owners})
+
+     # Wrap in orFilters format (list of AND groups)
+     or_filters = [{"and": and_filters}] if and_filters else []
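+     # For example, platforms=["urn:li:dataPlatform:notion"] plus
+     # tags=["urn:li:tag:critical"] produce:
+     #   [{"and": [{"field": "platform", "values": ["urn:li:dataPlatform:notion"]},
+     #             {"field": "tags", "values": ["urn:li:tag:critical"]}]}]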
+
+     # Fetch and apply default view
+     view_urn = fetch_global_default_view(graph)
+
+     # Choose search strategy
+     if search_strategy == "semantic":
+         gql_query = document_semantic_search_gql
+         operation_name = "documentSemanticSearch"
+         response_key = "semanticSearchAcrossEntities"
+         variables: Dict[str, Any] = {
+             "query": query,
+             "orFilters": or_filters,
+             "count": max(num_results, 1),
+             "viewUrn": view_urn,
+         }
+     else:
+         # Default: keyword search
+         gql_query = document_search_gql
+         operation_name = "documentSearch"
+         response_key = "searchAcrossEntities"
+         variables = {
+             "query": query,
+             "orFilters": or_filters,
+             "count": max(num_results, 1),
+             "start": offset,
+             "viewUrn": view_urn,
+         }
+
+     response = execute_graphql(
+         graph,
+         query=gql_query,
+         variables=variables,
+         operation_name=operation_name,
+     )[response_key]
+
+     if num_results == 0 and isinstance(response, dict):
+         # Support num_results=0 for facet-only queries
+         response.pop("searchResults", None)
+         response.pop("count", None)
+
+     return clean_gql_response(response)
+
+
+ def grep_documents(
+     urns: List[str],
+     pattern: str,
+     context_chars: int = 200,
+     max_matches_per_doc: int = 5,
+     start_offset: int = 0,
+ ) -> dict:
+     """Search within document content using regex patterns.
+
+     Similar to ripgrep/grep - finds matching excerpts within documents.
+     Use search_documents() first to find relevant document URNs, then use this
+     tool to search within their content.
+
+     PATTERN SYNTAX (RE2 regex):
+     - Simple text: "deploy" matches the word deploy
+     - Case insensitive: "(?i)deploy" matches Deploy, DEPLOY, deploy
+     - Word boundaries: r"\\bword\\b" matches whole word only
+     - Alternatives: "deploy|release" matches either term
+     - Wildcards: "deploy.*prod" matches deploy followed by prod
+     - Character classes: "[Dd]eploy" matches Deploy or deploy
+
+     PARAMETERS:
+
+     urns: List of document URNs to search within
+     - Get these from search_documents() results
+     - Example: ["urn:li:document:doc1", "urn:li:document:doc2"]
+
+     pattern: Regex pattern to search for
+     - Examples: "kubernetes", "(?i)deploy.*production", "error|warning"
+     - Use ".*" to get raw content (for continuing after truncation)
+
+     context_chars: Characters to show before/after each match (default: 200)
+     - Higher values show more surrounding context
+     - When reading raw content (pattern=".*"), use higher values (e.g., 8000)
+
+     max_matches_per_doc: Maximum matches to return per document (default: 5)
+     - Limits output size for documents with many matches
+
+     start_offset: Character offset to start searching from (default: 0)
+     - Use this to continue reading after get_entities() truncation
+     - When get_entities() returns _truncatedAtChar=8000, use start_offset=8000
+       to continue reading from where it left off
+
+     EXAMPLE WORKFLOWS:
+
+     1. Find deployment instructions:
+        docs = search_documents(query="deployment", sub_types=["Runbook"])
+        urns = [r["entity"]["urn"] for r in docs["searchResults"]]
+        grep_documents(urns, pattern="kubectl apply", context_chars=300)
+
+     2. Find all error handling sections (case insensitive):
+        grep_documents(urns, pattern="(?i)error|exception|failure")
+
+     3. Find specific configuration values:
+        grep_documents(urns, pattern=r"timeout.*=.*\\d+")
+
+     4. Continue reading after truncation (when get_entities returns _truncatedAtChar):
+        # After get_entities() shows: _truncatedAtChar=8000, _originalLengthChars=15000
+        grep_documents(urns=[doc_urn], pattern=".*", context_chars=8000, start_offset=8000)
+        # Returns content from char 8000 onwards
+
+     RETURNS:
+     - results: List of documents with matches, each containing:
+       - urn: Document URN
+       - title: Document title
+       - matches: List of excerpts with position info (positions are absolute)
+       - total_matches: Total matches found (may exceed max_matches_per_doc)
+       - content_length: Total length of document content (when start_offset is used)
+     - total_matches: Total matches across all documents
+     - documents_with_matches: Number of documents containing matches
+
+     Args:
+         urns: List of document URNs to search within
+         pattern: Regex pattern to search for
+         context_chars: Characters to show before/after each match
+         max_matches_per_doc: Maximum matches to return per document
+         start_offset: Character offset to start searching from
+
+     Returns:
+         Dictionary with search results
+
+     Example:
+         from datahub_agent_context.context import DataHubContext
+
+         with DataHubContext(client.graph):
+             result = grep_documents(urns=["urn:li:document:doc1"], pattern="kubernetes")
+     """
+     graph = get_graph()
+     if not urns:
+         return {
+             "results": [],
+             "total_matches": 0,
+             "documents_with_matches": 0,
+         }
+
+     # Fetch document content via GraphQL
+     response = execute_graphql(
+         graph,
+         query=read_documents_gql,
+         variables={"urns": urns},
+         operation_name="documentContent",
+     )
+
+     entities = response.get("entities", [])
+
+     # Compile regex pattern using RE2 (safe against ReDoS attacks)
+     # RE2 guarantees linear-time matching, preventing pathological backtracking
+     try:
+         regex = re2.compile(pattern)
+     except re2.error as e:
+         return {
+             "error": f"Invalid regex pattern: {e}",
+             "results": [],
+             "total_matches": 0,
+             "documents_with_matches": 0,
+         }
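+     # (RE2 gets its linear-time guarantee by rejecting constructs that need
+     # backtracking: backreferences such as r"(\w+) \1" and lookarounds fail to
+     # compile and are reported through the error branch above.)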
+
+     results = []
+     total_matches = 0
+     documents_with_matches = 0
+
+     for entity in entities:
+         if not entity:
+             continue
+
+         urn = entity.get("urn", "")
+         info = entity.get("info", {})
+         title = info.get("title", "Untitled")
+         contents = info.get("contents", {})
+         text = contents.get("text", "") if contents else ""
+
+         if not text:
+             continue
+
+         # Store original length before applying offset
+         full_content_length = len(text)
+
+         # Apply start_offset - skip first N characters
+         if start_offset > 0:
+             if start_offset >= len(text):
+                 # Offset is beyond document length, skip this document
+                 continue
+             text = text[start_offset:]
+
+         # Iterate through matches - only store excerpts for first max_matches_per_doc,
+         # but count all matches without keeping them in memory
+         excerpts: List[Dict[str, Any]] = []
+         doc_total_matches = 0
+
+         for match in regex.finditer(text):
+             doc_total_matches += 1
+
+             # Only extract excerpts for first max_matches_per_doc matches
+             if len(excerpts) < max_matches_per_doc:
+                 start_pos = max(0, match.start() - context_chars)
+                 end_pos = min(len(text), match.end() + context_chars)
+
+                 # Extract excerpt
+                 excerpt = text[start_pos:end_pos]
+
+                 # Add ellipsis if truncated
+                 if start_pos > 0:
+                     excerpt = "..." + excerpt
+                 if end_pos < len(text):
+                     excerpt = excerpt + "..."
+
+                 # Report absolute position (accounting for start_offset)
+                 absolute_position = match.start() + start_offset
+
+                 excerpts.append(
+                     {
+                         "excerpt": excerpt,
+                         "position": absolute_position,
+                     }
+                 )
+
+         if doc_total_matches == 0:
+             continue
+
+         documents_with_matches += 1
+         total_matches += doc_total_matches
+
+         result_entry: Dict[str, Any] = {
+             "urn": urn,
+             "title": title,
+             "matches": excerpts,
+             "total_matches": doc_total_matches,
+         }
+
+         # Include content_length when using start_offset to help with pagination
+         if start_offset > 0:
+             result_entry["content_length"] = full_content_length
+
+         results.append(result_entry)
+
+     return {
+         "results": results,
+         "total_matches": total_matches,
+         "documents_with_matches": documents_with_matches,
+     }
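
Taken together, the two tools support a search-then-grep workflow: find candidate documents by metadata, then scan their content for specific passages. The sketch below is a minimal illustration assembled from the docstrings above; the `client` variable is an assumption (an already-authenticated DataHub client whose `.graph` handle is accepted by DataHubContext), and the URN values are placeholders.

    from datahub_agent_context.context import DataHubContext
    from datahub_agent_context.mcp_tools.documents import grep_documents, search_documents

    # `client` is assumed to exist, as in the docstring examples.
    with DataHubContext(client.graph):
        # 1. Keyword search for candidate documents (metadata only, no content).
        docs = search_documents(
            query="deployment",
            platforms=["urn:li:dataPlatform:notion"],
            num_results=10,
        )
        urns = [r["entity"]["urn"] for r in docs["searchResults"]]

        # 2. Grep inside those documents with a case-insensitive RE2 pattern.
        hits = grep_documents(urns, pattern="(?i)kubectl apply", context_chars=300)

        # 3. If a document was truncated elsewhere (e.g. get_entities() reported
        #    _truncatedAtChar=8000), continue reading from that absolute offset.
        rest = grep_documents(
            urns=urns[:1], pattern=".*", context_chars=8000, start_offset=8000
        )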