datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Search tools for DataHub."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import pathlib
|
|
6
|
+
from typing import Any, Dict, Literal, Optional
|
|
7
|
+
|
|
8
|
+
from datahub.sdk.search_client import compile_filters
|
|
9
|
+
from datahub.sdk.search_filters import Filter, load_filters
|
|
10
|
+
from datahub_agent_context.context import get_graph
|
|
11
|
+
from datahub_agent_context.mcp_tools.base import (
|
|
12
|
+
clean_gql_response,
|
|
13
|
+
execute_graphql,
|
|
14
|
+
fetch_global_default_view,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)

# Load GraphQL queries
# The .gql documents live in the "gql" directory next to this module. Decode
# them explicitly as UTF-8 so the query text does not depend on the host's
# locale default encoding.
_gql_dir = pathlib.Path(__file__).parent / "gql"
search_gql = (_gql_dir / "search.gql").read_text(encoding="utf-8")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _convert_custom_filter_format(filters_obj: Any) -> Any:
|
|
25
|
+
"""
|
|
26
|
+
Convert chatbot's intuitive {"custom": {...}} format to the format expected by _CustomCondition.
|
|
27
|
+
|
|
28
|
+
Transforms:
|
|
29
|
+
{"custom": {"field": "urn", "condition": "EQUAL", "values": [...]}}
|
|
30
|
+
|
|
31
|
+
Into:
|
|
32
|
+
{"field": "urn", "condition": "EQUAL", "values": [...]}
|
|
33
|
+
|
|
34
|
+
This allows the discriminator to correctly identify it as _custom.
|
|
35
|
+
"""
|
|
36
|
+
if isinstance(filters_obj, dict):
|
|
37
|
+
# Check if this is a "custom" or "custom_condition" wrapper that needs unwrapping
|
|
38
|
+
if len(filters_obj) == 1 and (
|
|
39
|
+
"custom" in filters_obj or "custom_condition" in filters_obj
|
|
40
|
+
):
|
|
41
|
+
wrapper_key = "custom" if "custom" in filters_obj else "custom_condition"
|
|
42
|
+
custom_content = filters_obj[wrapper_key]
|
|
43
|
+
# Ensure it has the expected structure for _CustomCondition
|
|
44
|
+
if isinstance(custom_content, dict) and "field" in custom_content:
|
|
45
|
+
return custom_content
|
|
46
|
+
|
|
47
|
+
# Recursively process nested filters (for "and", "or", etc.)
|
|
48
|
+
result = {}
|
|
49
|
+
for key, value in filters_obj.items():
|
|
50
|
+
if isinstance(value, (list, dict)):
|
|
51
|
+
result[key] = _convert_custom_filter_format(value)
|
|
52
|
+
else:
|
|
53
|
+
result[key] = value
|
|
54
|
+
return result
|
|
55
|
+
elif isinstance(filters_obj, list):
|
|
56
|
+
# Process list of filters
|
|
57
|
+
return [_convert_custom_filter_format(item) for item in filters_obj]
|
|
58
|
+
else:
|
|
59
|
+
# Return primitive values unchanged
|
|
60
|
+
return filters_obj
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def search(
    query: str = "*",
    filters: Optional[Filter | str] = None,
    num_results: int = 10,
    sort_by: Optional[str] = None,
    sort_order: Optional[Literal["asc", "desc"]] = "desc",
    offset: int = 0,
) -> dict:
    """Search across DataHub entities using structured full-text search.
    Results are ordered by relevance and importance - examine top results first.

    SEARCH SYNTAX:
    - Structured full-text search - **always start queries with /q**
    - **Recommended: Use + operator for AND** (handles punctuation better than quotes)
    - Supports full boolean logic: AND (default), OR, NOT, parentheses, field searches
    - Examples:
      • /q user+transaction → requires both terms (better for field names with _ or punctuation)
      • /q point+sale+app → requires all terms (works with point_of_sale_app_usage)
      • /q wizard OR pet → entities containing either term
      • /q revenue* → wildcard matching (revenue_2023, revenue_2024, revenue_monthly, etc.)
      • /q tag:PII → search by tag name
      • /q "exact table name" → exact phrase matching (use sparingly)
      • /q (sales OR revenue) AND quarterly → complex boolean combinations
    - Fast and precise for exact matching, technical terms, and complex queries
    - Best for: entity names, identifiers, column names, or any search needing boolean logic

    PAGINATION:
    - num_results: Number of results to return per page (max: 50)
    - offset: Starting position in results (default: 0)
    - Examples:
      • First page: offset=0, num_results=10
      • Second page: offset=10, num_results=10
      • Third page: offset=20, num_results=10

    FACET EXPLORATION - Discover metadata without returning results:
    - Set num_results=0 to get ONLY facets (no search results)
    - Facets show ALL tags, glossaryTerms, platforms, domains used in the catalog
    - Example: search(query="*", filters={"entity_type": ["DATASET"]}, num_results=0)
      → Returns facets showing all tags/glossaryTerms applied to datasets
    - Use this to discover what metadata exists before doing filtered searches

    TYPICAL WORKFLOW:
    1. Facet exploration: search(query="*", filters={"entity_type": ["DATASET"]}, num_results=0)
       → Examine tags/glossaryTerms facets to see what metadata exists
    2. Filtered search: search(query="*", filters={"tag": ["urn:li:tag:pii"]}, num_results=30)
       → Get entities with specific tag using URN from step 1
    3. Get details: Use get_entities() on specific results

    Here are some example filters:
    - All Looker assets
    ```
    {"platform": ["looker"]}
    ```
    - Production environment warehouse assets
    ```
    {
      "and": [
        {"env": ["PROD"]},
        {"platform": ["snowflake", "bigquery", "redshift"]}
      ]
    }
    ```
    - Filter by domain (MUST use full URN format)
    ```
    {"domain": ["urn:li:domain:marketing"]}
    {"domain": ["urn:li:domain:9f8e7d6c-5b4a-3928-1765-432109876543", "urn:li:domain:7c6b5a49-3827-1654-9032-8f7e6d5c4b3a"]}
    ```
    IMPORTANT: Domain filters require full URN format starting with "urn:li:domain:",
    NOT short names like "marketing" or "customer". Domain URNs can be readable names
    or GUIDs. Always search with {"entity_type": ["domain"]}
    to find valid domain URNs first, then use the exact URN from the results.

    SUPPORTED FILTER TYPES (only these will work):
    - entity_type: ["dataset"], ["dashboard", "chart"], ["corp_user"], ["corp_group"]
    - entity_subtype: ["Table"], ["View", "Model"]
    - platform: ["snowflake"], ["looker", "tableau"]
    - domain: ["urn:li:domain:marketing"] (full URN required)
    - container: ["urn:li:container:..."] (full URN required)
    - tag: ["urn:li:tag:PII"] (full tag URN required)
    - glossary_term: ["urn:li:glossaryTerm:uuid"] (full term URN required)
    - owner: ["urn:li:corpuser:alice", "urn:li:corpGroup:marketing"] (full user or group URN required)
    - custom: {"field": "fieldName", "condition": "EQUAL", "values": [...]}
    - status: ["NOT_SOFT_DELETED"] (for non-deleted entities)
    - env: ["PROD"], ["DEV", "STAGING"] (Should not use unless explicitly requested)
    - and: [filter1, filter2] (combines multiple filters)
    - or: [filter1, filter2] (matches any filter)
    - not: {"entity_type": ["dataset"]} (excludes matches)

    CRITICAL: Use only ONE discriminator key per filter object. Never mix
    entity_type with custom, domain, etc. at the same level. Use "and" or "or" to combine.

    SEARCH STRATEGY EXAMPLES:
    - /q customer+behavior → finds tables with both terms (works with customer_behavior fields)
    - /q customer OR user → finds tables with either term
    - /q (financial OR revenue) AND metrics → complex boolean logic

    SORTING - Order results by specific fields:
    - sort_by: Field name to sort by (optional)
    - sort_order: "desc" (default) or "asc"

    Note: If sort_by is not provided, search results use default ranking by relevance and
    importance. When using sort_by, results are strictly ordered by that field.

    Args:
        query: Search query string (use /q prefix for structured queries)
        filters: Optional filter object or JSON string. An empty/blank string or
            a JSON ``null`` is treated as "no filter".
        num_results: Number of results to return (max 50)
        sort_by: Optional field name to sort by
        sort_order: Sort order ("asc" or "desc")
        offset: Starting position for pagination

    Returns:
        Dictionary with search results, facets, and metadata

    Raises:
        json.JSONDecodeError: If ``filters`` is a non-blank string that is not valid JSON.

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = search(query="/q users", filters={"entity_type": ["dataset"]})
    """
    graph = get_graph()
    # Cap num_results at 50 to prevent excessive requests
    num_results = min(num_results, 50)

    # Normalise stringified-JSON and plain-dict filters through one path.
    # Anything else (None or an already-built Filter object) is passed to
    # compile_filters untouched.
    if isinstance(filters, (str, dict)):
        raw_filters: Any = filters
        if isinstance(filters, str):
            # A blank string means "no filter"; json.loads("") would raise.
            raw_filters = json.loads(filters) if filters.strip() else None
        if raw_filters is None:
            filters = None
        else:
            # Convert "custom" wrapper to direct _custom format for compatibility
            filters = load_filters(_convert_custom_filter_format(raw_filters))

    types, compiled_filters = compile_filters(filters)

    # Fetch and apply default view (returns None if disabled or not configured)
    view_urn = fetch_global_default_view(graph)
    if view_urn:
        # %-style arguments defer string formatting until DEBUG is enabled.
        logger.debug("Applying default view: %s", view_urn)
    else:
        logger.debug("No default view to apply")

    variables: Dict[str, Any] = {
        "query": query,
        "types": types,
        "orFilters": compiled_filters,
        "count": max(num_results, 1),  # 0 is not a valid value for count.
        "start": offset,
        "viewUrn": view_urn,  # Will be None if disabled or not set
    }

    # Add sorting if requested; anything other than "asc" sorts descending.
    if sort_by is not None:
        sort_order_enum = "ASCENDING" if sort_order == "asc" else "DESCENDING"
        variables["sortInput"] = {
            "sortCriteria": [{"field": sort_by, "sortOrder": sort_order_enum}]
        }

    # Use keyword search
    response = execute_graphql(
        graph,
        query=search_gql,
        variables=variables,
        operation_name="search",
    )["searchAcrossEntities"]

    if num_results == 0 and isinstance(response, dict):
        # Hack to support num_results=0 without support for it in the backend:
        # one result was requested above, so drop it and keep only the facets.
        response.pop("searchResults", None)
        response.pop("count", None)

    return clean_gql_response(response)
|
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
"""Structured property management tools for DataHub MCP server."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Dict, List, Union
|
|
6
|
+
|
|
7
|
+
from datahub.utilities.urns._urn_base import Urn
|
|
8
|
+
from datahub_agent_context.context import get_graph
|
|
9
|
+
from datahub_agent_context.mcp_tools.base import execute_graphql
|
|
10
|
+
|
|
11
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _validate_and_fetch_structured_property(property_urn: str) -> Dict:
    """
    Validate that the structured property exists and fetch its definition.

    Args:
        property_urn: URN of the structured property to look up.

    Returns:
        Dictionary with property definition including valueType and entityTypes.
        An empty dict is returned when the entity carries no definition.

    Raises:
        ValueError: If the property URN does not exist or is invalid, or if the
            lookup itself fails (the underlying error is chained as the cause).
    """
    graph = get_graph()
    query = """
        query getStructuredProperty($urn: String!) {
            entity(urn: $urn) {
                urn
                type
                ... on StructuredPropertyEntity {
                    definition {
                        qualifiedName
                        entityTypes {
                            urn
                            type
                            info {
                                type
                            }
                        }
                        valueType {
                            urn
                            info {
                                qualifiedName
                            }
                        }
                        cardinality
                    }
                }
            }
        }
    """

    try:
        result = execute_graphql(
            graph,
            query=query,
            variables={"urn": property_urn},
            operation_name="getStructuredProperty",
        )

        entity = result.get("entity")

        if entity is None:
            raise ValueError(
                f"Structured property URN does not exist in DataHub: {property_urn}. "
                f"Please use the search tool to find existing structured properties, "
                f"or create the property first before assigning it."
            )

        if entity.get("type") != "STRUCTURED_PROPERTY":
            raise ValueError(
                f"The URN is not a structured property entity: {property_urn} (type: {entity.get('type')})"
            )

        # `definition` can be present but null; `or {}` keeps the Dict contract
        # so callers can safely chain `.get(...)` on the result.
        return entity.get("definition") or {}

    except ValueError:
        # Already carries a caller-facing message; propagate unchanged.
        raise
    except Exception as e:
        raise ValueError(f"Failed to validate structured property URN: {str(e)}") from e
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _validate_property_value(
|
|
84
|
+
property_definition: Dict, value: Union[str, float, int]
|
|
85
|
+
) -> Dict:
|
|
86
|
+
"""
|
|
87
|
+
Validate and convert a property value to the appropriate GraphQL format.
|
|
88
|
+
|
|
89
|
+
Supports 5 data types:
|
|
90
|
+
- datahub.string: Plain text strings
|
|
91
|
+
- datahub.number: Numeric values (int, float, double, long)
|
|
92
|
+
- datahub.urn: DataHub URN references
|
|
93
|
+
- datahub.date: ISO 8601 date strings
|
|
94
|
+
- datahub.rich_text: Rich text/markdown content
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
property_definition: The property definition containing valueType info
|
|
98
|
+
value: The value to validate and convert
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
Dictionary with either stringValue or numberValue key
|
|
102
|
+
|
|
103
|
+
Raises:
|
|
104
|
+
ValueError: If the value type doesn't match the property's valueType
|
|
105
|
+
"""
|
|
106
|
+
value_type_info = property_definition.get("valueType", {}).get("info", {})
|
|
107
|
+
qualified_name = value_type_info.get("qualifiedName", "").lower()
|
|
108
|
+
|
|
109
|
+
# Determine the data type
|
|
110
|
+
is_numeric_type = any(
|
|
111
|
+
numeric_type in qualified_name
|
|
112
|
+
for numeric_type in ["number", "int", "float", "double", "long"]
|
|
113
|
+
)
|
|
114
|
+
is_urn_type = "urn" in qualified_name and "datahub.urn" in qualified_name
|
|
115
|
+
is_date_type = "date" in qualified_name
|
|
116
|
+
is_rich_text_type = "rich_text" in qualified_name or "richtext" in qualified_name
|
|
117
|
+
|
|
118
|
+
if is_numeric_type:
|
|
119
|
+
# Value should be numeric
|
|
120
|
+
if isinstance(value, (int, float)):
|
|
121
|
+
return {"numberValue": float(value)}
|
|
122
|
+
elif isinstance(value, str):
|
|
123
|
+
try:
|
|
124
|
+
return {"numberValue": float(value)}
|
|
125
|
+
except ValueError as e:
|
|
126
|
+
raise ValueError(
|
|
127
|
+
f"Property expects numeric type ({qualified_name}), but got non-numeric string: {value}"
|
|
128
|
+
) from e
|
|
129
|
+
else:
|
|
130
|
+
raise ValueError(
|
|
131
|
+
f"Property expects numeric type ({qualified_name}), got {type(value).__name__}"
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
elif is_urn_type:
|
|
135
|
+
# Value should be a valid DataHub URN
|
|
136
|
+
if not isinstance(value, str):
|
|
137
|
+
value = str(value)
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
# Validate URN format
|
|
141
|
+
Urn.from_string(value)
|
|
142
|
+
return {"stringValue": value}
|
|
143
|
+
except Exception as e:
|
|
144
|
+
raise ValueError(
|
|
145
|
+
f"Property expects URN type ({qualified_name}), but got invalid URN: {value}. "
|
|
146
|
+
f"URNs must be in format 'urn:li:entityType:...' Error: {str(e)}"
|
|
147
|
+
) from e
|
|
148
|
+
|
|
149
|
+
elif is_date_type:
|
|
150
|
+
# Value should be an ISO 8601 date string
|
|
151
|
+
if not isinstance(value, str):
|
|
152
|
+
value = str(value)
|
|
153
|
+
|
|
154
|
+
# Try to parse as ISO 8601 date
|
|
155
|
+
try:
|
|
156
|
+
# Support various ISO 8601 formats
|
|
157
|
+
# Examples: 2024-12-22, 2024-12-22T10:30:00, 2024-12-22T10:30:00Z, 2024-12-22T10:30:00+00:00
|
|
158
|
+
datetime.fromisoformat(value.replace("Z", "+00:00"))
|
|
159
|
+
return {"stringValue": value}
|
|
160
|
+
except ValueError as e:
|
|
161
|
+
raise ValueError(
|
|
162
|
+
f"Property expects date type ({qualified_name}), but got invalid date format: {value}. "
|
|
163
|
+
f"Dates must be in ISO 8601 format (e.g., '2024-12-22', '2024-12-22T10:30:00Z')"
|
|
164
|
+
) from e
|
|
165
|
+
|
|
166
|
+
elif is_rich_text_type:
|
|
167
|
+
# Value should be string (can contain markdown/HTML)
|
|
168
|
+
if isinstance(value, str):
|
|
169
|
+
return {"stringValue": value}
|
|
170
|
+
else:
|
|
171
|
+
# Convert to string for non-string types
|
|
172
|
+
return {"stringValue": str(value)}
|
|
173
|
+
|
|
174
|
+
else:
|
|
175
|
+
# Default to string type (datahub.string or unknown types)
|
|
176
|
+
if isinstance(value, str):
|
|
177
|
+
return {"stringValue": value}
|
|
178
|
+
else:
|
|
179
|
+
# Convert to string for non-string types
|
|
180
|
+
return {"stringValue": str(value)}
|
|
181
|
+
|
|
182
|
+
raise ValueError(
|
|
183
|
+
f"Value type mismatch: property expects {qualified_name}, got {type(value).__name__}"
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def add_structured_properties(
    property_values: Dict[str, List[Union[str, float, int]]],
    entity_urns: List[str],
) -> dict:
    """Add structured properties with values to multiple DataHub entities.

    This tool allows you to assign structured properties to multiple entities in a single operation.
    Structured properties are schema-defined metadata fields that can store typed values (strings, numbers, etc.).

    Args:
        property_values: Dictionary mapping structured property URNs to lists of values.
                        Example: {
                            "urn:li:structuredProperty:io.acryl.privacy.retentionTime": ["90"],
                            "urn:li:structuredProperty:io.acryl.common.businessCriticality": ["HIGH"]
                        }
                        A bare scalar (e.g. "HIGH" instead of ["HIGH"]) is accepted and
                        treated as a single-value list.
        entity_urns: List of entity URNs to assign properties to (e.g., dataset URNs, dashboard URNs)

    Returns:
        Dictionary with "success" (True) and a human-readable "message".

    Raises:
        ValueError: If either argument is empty, a property URN cannot be validated,
            or a value does not match its property's value type.
        RuntimeError: If the upsert fails for one or more entities.

    Examples:
        # Add retention time and criticality to datasets
        add_structured_properties(
            property_values={
                "urn:li:structuredProperty:io.acryl.privacy.retentionTime": ["90"],
                "urn:li:structuredProperty:io.acryl.common.businessCriticality": ["HIGH"]
            },
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Add numeric property
        add_structured_properties(
            property_values={
                "urn:li:structuredProperty:io.acryl.dataQuality.scoreThreshold": [0.95]
            },
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.verified_data,PROD)"
            ]
        )

        # Add multiple values for a multi-valued property
        add_structured_properties(
            property_values={
                "urn:li:structuredProperty:io.acryl.common.dataClassification": ["PII", "SENSITIVE"]
            },
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)"
            ]
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = add_structured_properties(
                property_values={"urn:li:structuredProperty:...": ["value"]},
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    graph = get_graph()
    if not property_values:
        raise ValueError("property_values cannot be empty")
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    # Validate all structured properties and fetch their definitions up front,
    # so nothing is written if any property URN is invalid.
    property_definitions = {
        property_urn: _validate_and_fetch_structured_property(property_urn)
        for property_urn in property_values
    }

    # Build structured property input params with type validation
    structured_property_params = []
    for property_urn, values in property_values.items():
        property_def = property_definitions[property_urn]

        # A bare scalar would otherwise be iterated character by character
        # (for a str) or fail with TypeError (for a number).
        if isinstance(values, (str, int, float)):
            values = [values]

        # Validate and convert each value
        converted_values = []
        for value in values:
            try:
                converted_values.append(_validate_property_value(property_def, value))
            except ValueError as e:
                raise ValueError(
                    f"Value validation failed for {property_urn}: {str(e)}"
                ) from e

        structured_property_params.append(
            {"structuredPropertyUrn": property_urn, "values": converted_values}
        )

    # Execute upsert for each entity
    mutation = """
        mutation upsertStructuredProperties($input: UpsertStructuredPropertiesInput!) {
            upsertStructuredProperties(input: $input) {
                properties {
                    structuredProperty {
                        urn
                    }
                }
            }
        }
    """

    success_count = 0
    # One message per failed entity, so its length is also the failure count.
    error_messages = []

    for entity_urn in entity_urns:
        variables = {
            "input": {
                "assetUrn": entity_urn,
                "structuredPropertyInputParams": structured_property_params,
            }
        }

        try:
            result = execute_graphql(
                graph,
                query=mutation,
                variables=variables,
                operation_name="upsertStructuredProperties",
            )

            if result.get("upsertStructuredProperties"):
                success_count += 1
            else:
                error_messages.append(
                    f"{entity_urn}: operation returned false or empty result"
                )

        except Exception as e:
            # Best effort: record the failure and keep going with the
            # remaining entities; everything is reported together below.
            error_messages.append(f"{entity_urn}: {str(e)}")

    if error_messages:
        # Only the first three errors are spelled out to keep the message short.
        error_details = "; ".join(error_messages[:3])
        if len(error_messages) > 3:
            error_details += f"; and {len(error_messages) - 3} more error(s)"
        raise RuntimeError(
            f"Failed to add structured properties to {len(error_messages)} entit(ies). Errors: {error_details}"
        )

    return {
        "success": True,
        "message": f"Successfully added {len(property_values)} structured propert(ies) to {success_count} entit(ies)",
    }
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def remove_structured_properties(
    property_urns: List[str],
    entity_urns: List[str],
) -> dict:
    """Remove structured properties from multiple DataHub entities.

    This tool allows you to remove structured property assignments from multiple entities in a single operation.

    Every property URN is validated first; the removal mutation is then sent
    once per entity. Failures are collected across all entities and reported
    together after the last one has been attempted.

    Args:
        property_urns: List of structured property URNs to remove
                      Example: ["urn:li:structuredProperty:io.acryl.privacy.retentionTime"]
        entity_urns: List of entity URNs to remove properties from (e.g., dataset URNs, dashboard URNs)

    Returns:
        Dictionary with "success" (True) and a human-readable "message".

    Raises:
        ValueError: If either list is empty or a property URN fails validation.
        RuntimeError: If the removal fails for one or more entities.

    Examples:
        # Remove retention time property from datasets
        remove_structured_properties(
            property_urns=["urn:li:structuredProperty:io.acryl.privacy.retentionTime"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.old_data,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.archived,PROD)"
            ]
        )

        # Remove multiple properties at once
        remove_structured_properties(
            property_urns=[
                "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
                "urn:li:structuredProperty:io.acryl.common.businessCriticality"
            ],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.temp_table,PROD)"
            ]
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = remove_structured_properties(
                property_urns=["urn:li:structuredProperty:..."],
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    graph = get_graph()
    if not property_urns:
        raise ValueError("property_urns cannot be empty")
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    # Validate all structured properties exist before touching any entity.
    for candidate_urn in property_urns:
        _validate_and_fetch_structured_property(candidate_urn)

    mutation = """
        mutation removeStructuredProperties($input: RemoveStructuredPropertiesInput!) {
            removeStructuredProperties(input: $input) {
                properties {
                    structuredProperty {
                        urn
                    }
                }
            }
        }
    """

    succeeded = 0
    # One entry per failed entity, so len(problems) doubles as the failure count.
    problems = []

    for asset_urn in entity_urns:
        payload = {
            "input": {
                "assetUrn": asset_urn,
                "structuredPropertyUrns": property_urns,
            }
        }

        try:
            outcome = execute_graphql(
                graph,
                query=mutation,
                variables=payload,
                operation_name="removeStructuredProperties",
            )
            removed = bool(outcome.get("removeStructuredProperties"))
        except Exception as exc:
            # Record the failure and continue with the remaining entities.
            problems.append(f"{asset_urn}: {str(exc)}")
            continue

        if removed:
            succeeded += 1
        else:
            problems.append(f"{asset_urn}: operation returned false or empty result")

    if problems:
        # Spell out at most three errors; summarise the rest as a count.
        shown = problems[:3]
        hidden = len(problems) - 3
        error_details = "; ".join(shown)
        if hidden > 0:
            error_details += f"; and {hidden} more error(s)"
        raise RuntimeError(
            f"Failed to remove structured properties from {len(problems)} entit(ies). Errors: {error_details}"
        )

    return {
        "success": True,
        "message": f"Successfully removed {len(property_urns)} structured propert(ies) from {succeeded} entit(ies)",
    }
|