datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""Owner management tools for DataHub MCP server."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List, Literal, Optional
|
|
5
|
+
|
|
6
|
+
from datahub_agent_context.context import get_graph
|
|
7
|
+
from datahub_agent_context.mcp_tools.base import execute_graphql
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _validate_owner_urns(owner_urns: List[str]) -> None:
    """Validate that every owner URN resolves to a CorpUser or CorpGroup entity.

    Runs a single ``getOwners`` GraphQL query for all URNs against the graph
    returned by ``get_graph()`` and inspects the returned ``entities`` list.

    Args:
        owner_urns: Owner URNs to check.

    Raises:
        ValueError: If any URN is absent from the response, if any returned
            entity has a ``type`` other than ``CORP_USER`` / ``CORP_GROUP``,
            or if the lookup itself fails (the original error is chained).
    """
    graph = get_graph()
    # Query to check if owners exist and are valid types
    query = """
        query getOwners($urns: [String!]!) {
            entities(urns: $urns) {
                urn
                type
                ... on CorpUser {
                    username
                }
                ... on CorpGroup {
                    name
                }
            }
        }
    """

    try:
        result = execute_graphql(
            graph,
            query=query,
            variables={"urns": owner_urns},
            operation_name="getOwners",
        )

        # A present-but-null "entities" value is treated like an empty list so
        # the caller gets the actionable "do not exist" message instead of an
        # opaque "'NoneType' object is not iterable" failure.
        entities = [
            entity
            for entity in (result.get("entities") or [])
            if entity is not None
        ]

        # Check for missing owners
        found_urns = {entity["urn"] for entity in entities}
        missing_urns = [urn for urn in owner_urns if urn not in found_urns]
        if missing_urns:
            raise ValueError(
                f"The following owner URNs do not exist in DataHub: {', '.join(missing_urns)}. "
                f"Please use the search tool with entity_type filter to find existing users or groups, "
                f"or create the owners first before assigning them."
            )

        # Verify all returned entities are either CorpUser or CorpGroup
        invalid_type_entities = [
            entity["urn"]
            for entity in entities
            if entity.get("type") not in ("CORP_USER", "CORP_GROUP")
        ]
        if invalid_type_entities:
            raise ValueError(
                f"The following URNs are not valid owner entities (must be CorpUser or CorpGroup): {', '.join(invalid_type_entities)}"
            )

    except ValueError:
        # Validation failures raised above (or by the lookup) pass through as-is.
        raise
    except Exception as e:
        # Anything else is reported under the single exception type callers expect.
        raise ValueError(f"Failed to validate owner URNs: {e}") from e
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _batch_modify_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str],
    operation: Literal["add", "remove"],
) -> dict:
    """Internal helper for batch owner operations (add/remove).

    Validates inputs, constructs the GraphQL mutation (``batchAddOwners`` or
    ``batchRemoveOwners``), and executes it.

    Args:
        owner_urns: Owner URNs to add or remove; must be non-empty.
        entity_urns: URNs of the entities whose ownership is modified; must be
            non-empty.
        ownership_type_urn: Optional ownership type URN. When given it is sent
            on the mutation input and, for "add", on every owner entry as well.
        operation: "add" selects ``batchAddOwners``; any other value is handled
            as a removal.

    Returns:
        ``{"success": True, "message": ...}`` when the mutation reports success.

    Raises:
        ValueError: If either URN list is empty or owner validation fails.
        RuntimeError: If the mutation returns a falsy result or execution fails
            (the original error is chained).
    """
    graph = get_graph()
    # Validate inputs
    if not owner_urns:
        raise ValueError("owner_urns cannot be empty")
    if not entity_urns:
        raise ValueError("entity_urns cannot be empty")

    # Validate that all owner URNs exist and are valid types.
    # NOTE(review): this also runs for removals, so owners that no longer
    # resolve cannot be removed through this path — confirm that is intended.
    _validate_owner_urns(owner_urns)

    # Build the resources list for the GraphQL mutation
    resources = [{"resourceUrn": resource_urn} for resource_urn in entity_urns]

    if operation == "add":
        # Adding owners requires ownerEntityType, which is derived from the URN.
        owners = []
        for owner_urn in owner_urns:
            # Match on the URN prefix rather than a substring so a group whose
            # id happens to contain ":corpuser:" is not classified as a user.
            owner_entity_type = (
                "CORP_USER"
                if owner_urn.lower().startswith("urn:li:corpuser:")
                else "CORP_GROUP"
            )
            owner_input: dict = {
                "ownerUrn": owner_urn,
                "ownerEntityType": owner_entity_type,
            }
            if ownership_type_urn:
                owner_input["ownershipTypeUrn"] = ownership_type_urn
            owners.append(owner_input)

        mutation = """
            mutation batchAddOwners($input: BatchAddOwnersInput!) {
                batchAddOwners(input: $input)
            }
        """
        mutation_input: dict = {
            "owners": owners,
            "resources": resources,
        }
        operation_name = "batchAddOwners"
        success_verb = "added"
        failure_verb = "add"
        preposition = "to"
    else:  # remove
        mutation = """
            mutation batchRemoveOwners($input: BatchRemoveOwnersInput!) {
                batchRemoveOwners(input: $input)
            }
        """
        mutation_input = {
            "ownerUrns": owner_urns,
            "resources": resources,
        }
        operation_name = "batchRemoveOwners"
        success_verb = "removed"
        failure_verb = "remove"
        preposition = "from"

    if ownership_type_urn:
        mutation_input["ownershipTypeUrn"] = ownership_type_urn

    try:
        result = execute_graphql(
            graph,
            query=mutation,
            variables={"input": mutation_input},
            operation_name=operation_name,
        )
        succeeded = bool(result.get(operation_name, False))
    except RuntimeError:
        raise
    except Exception as e:
        raise RuntimeError(f"Error {failure_verb} owners: {e}") from e

    if not succeeded:
        raise RuntimeError(
            f"Failed to {failure_verb} owners - operation returned false"
        )

    return {
        "success": True,
        "message": f"Successfully {success_verb} {len(owner_urns)} owner(s) {preposition} {len(entity_urns)} entit(ies)",
    }
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def add_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str] = None,
) -> dict:
    """Add one or more owners to one or more DataHub entities.

    Every owner in ``owner_urns`` is assigned to every entity in
    ``entity_urns`` in a single batch operation. Handy for bulk ownership
    assignment such as attaching data stewards, technical owners, or business
    owners to datasets, dashboards, and other DataHub entities.

    Note: Ownership in DataHub is entity-level only. For field-level metadata,
    use tags or glossary terms instead.

    Args:
        owner_urns: Owner URNs to add (must be CorpUser or CorpGroup URNs).
            Examples: ["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"]
        entity_urns: URNs of the entities receiving the owners (e.g., dataset
            URNs, dashboard URNs).
        ownership_type_urn: Optional ownership type URN
            (e.g., "urn:li:ownershipType:dataowner",
            "urn:li:ownershipType:technical_owner").
            When omitted, the ownership type is left to the mutation default.

    Returns:
        Dictionary with:
        - success: Boolean indicating the operation succeeded
        - message: Summary of what was added

    Raises:
        ValueError: If a URN list is empty or an owner URN fails validation.
        RuntimeError: If the mutation fails or reports no success.

    Examples:
        # Add owners to multiple datasets
        add_owners(
            owner_urns=["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Add technical owner with specific ownership type
        add_owners(
            owner_urns=["urn:li:corpuser:jane.smith"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:technical_owner"
        )

        # Add data owner to multiple entities
        add_owners(
            owner_urns=["urn:li:corpuser:data.steward"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.sales,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.transactions,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,sales_dashboard,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:dataowner"
        )

    Usage with a context:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = add_owners(
                owner_urns=["urn:li:corpuser:john.doe"],
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    # All validation and mutation logic lives in the shared batch helper.
    return _batch_modify_owners(
        owner_urns=owner_urns,
        entity_urns=entity_urns,
        ownership_type_urn=ownership_type_urn,
        operation="add",
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def remove_owners(
    owner_urns: List[str],
    entity_urns: List[str],
    ownership_type_urn: Optional[str] = None,
) -> dict:
    """Remove one or more owners from one or more DataHub entities.

    Every owner in ``owner_urns`` is unassigned from every entity in
    ``entity_urns`` in a single batch operation. Handy for bulk ownership
    removal such as dropping owners who changed roles, cleaning up stale
    ownership, or correcting misassigned ownership.

    Note: Ownership in DataHub is entity-level only. For field-level metadata,
    use tags or glossary terms instead.

    Args:
        owner_urns: Owner URNs to remove (must be CorpUser or CorpGroup URNs).
            Examples: ["urn:li:corpuser:john.doe", "urn:li:corpGroup:data-engineering"]
        entity_urns: URNs of the entities losing the owners (e.g., dataset
            URNs, dashboard URNs).
        ownership_type_urn: Optional ownership type URN naming which type of
            ownership to remove (e.g., "urn:li:ownershipType:dataowner").
            When omitted, ownership is removed regardless of type.

    Returns:
        Dictionary with:
        - success: Boolean indicating the operation succeeded
        - message: Summary of what was removed

    Raises:
        ValueError: If a URN list is empty or an owner URN fails validation.
        RuntimeError: If the mutation fails or reports no success.

    Examples:
        # Remove owners from multiple datasets
        remove_owners(
            owner_urns=["urn:li:corpuser:former.employee", "urn:li:corpGroup:old-team"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ]
        )

        # Remove technical owner with specific ownership type
        remove_owners(
            owner_urns=["urn:li:corpuser:john.doe"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.customers,PROD)"
            ],
            ownership_type_urn="urn:li:ownershipType:technical_owner"
        )

        # Remove temporary owner from multiple entities
        remove_owners(
            owner_urns=["urn:li:corpuser:temp.owner"],
            entity_urns=[
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.stable_table,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD)",
                "urn:li:dashboard:(urn:li:dataPlatform:looker,temp_dashboard,PROD)"
            ]
        )

    Usage with a context:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = remove_owners(
                owner_urns=["urn:li:corpuser:former.employee"],
                entity_urns=["urn:li:dataset:(...)"]
            )
    """
    # All validation and mutation logic lives in the shared batch helper.
    return _batch_modify_owners(
        owner_urns=owner_urns,
        entity_urns=entity_urns,
        ownership_type_urn=ownership_type_urn,
        operation="remove",
    )
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Tools for getting dataset queries."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import logging
|
|
5
|
+
import pathlib
|
|
6
|
+
from typing import Literal, Optional
|
|
7
|
+
|
|
8
|
+
from datahub.sdk.search_client import compile_filters
|
|
9
|
+
from datahub.sdk.search_filters import FilterDsl
|
|
10
|
+
from datahub.utilities.ordered_set import OrderedSet
|
|
11
|
+
from datahub_agent_context.context import get_graph
|
|
12
|
+
from datahub_agent_context.mcp_tools.base import clean_gql_response, execute_graphql
|
|
13
|
+
from datahub_agent_context.mcp_tools.helpers import (
|
|
14
|
+
maybe_convert_to_schema_field_urn,
|
|
15
|
+
truncate_query,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Load the GraphQL document once at import time from the gql/ directory next to
# this module. The explicit encoding keeps the read independent of the locale.
queries_gql = (pathlib.Path(__file__).parent / "gql" / "queries.gql").read_text(
    encoding="utf-8"
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _deduplicate_subjects(subjects: list[dict]) -> list[str]:
|
|
25
|
+
"""Deduplicate subjects to unique dataset URNs.
|
|
26
|
+
|
|
27
|
+
The "subjects" field returns every dataset and schema field associated with the query.
|
|
28
|
+
While this is useful for our backend to have, it's not useful here because
|
|
29
|
+
we can just look at the query directly. So we'll narrow it down to the unique
|
|
30
|
+
list of dataset urns.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
subjects: List of subject dicts with dataset/schemaField info
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
List of unique dataset URNs
|
|
37
|
+
"""
|
|
38
|
+
updated_subjects: OrderedSet[str] = OrderedSet()
|
|
39
|
+
for subject in subjects:
|
|
40
|
+
with contextlib.suppress(KeyError):
|
|
41
|
+
updated_subjects.add(subject["dataset"]["urn"])
|
|
42
|
+
return list(updated_subjects)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_dataset_queries(
    urn: str,
    column: Optional[str] = None,
    source: Optional[Literal["MANUAL", "SYSTEM"]] = None,
    start: int = 0,
    count: int = 10,
) -> dict:
    """Get SQL queries associated with a dataset or column to understand usage patterns.

    This tool retrieves actual SQL queries that reference a specific dataset or column.
    Useful for understanding how data is used, common JOIN patterns, typical filters,
    and aggregation logic.

    Args:
        urn: Dataset URN
        column: Optional column name to filter queries
        source: Filter by query origin:
            - "MANUAL": Queries written by users in query editors (real SQL patterns)
            - "SYSTEM": Queries extracted from BI tools/dashboards (production usage)
            - None: Return both types (default)
        start: Starting offset for pagination (default: 0)
        count: Number of queries to return (default: 10)

    Returns:
        Dictionary with:
        - total: Total number of queries matching criteria
        - start: Starting offset
        - count: Number of results returned
        - queries: Array of query objects with:
            - urn: Query identifier
            - properties.statement.value: The actual SQL text (truncated when long)
            - properties.statement.language: Query language (SQL, etc.)
            - properties.source: MANUAL or SYSTEM
            - properties.name: Optional query name
            - platform: Source platform
            - subjects: Referenced datasets/columns (deduplicated to dataset URNs)

    COMMON USE CASES:

    1. SQL Generation - Learn real query patterns:
       get_dataset_queries(urn, source="MANUAL", count=5)   # 5-10 is a good range
       → See how users actually write SQL against this table
       → Discover common JOINs, aggregations, filters
       → Match organizational SQL conventions and patterns

    2. Production usage analysis:
       get_dataset_queries(urn, source="SYSTEM", count=20)
       → See how dashboards and reports query this data
       → Understand which queries run in production
       → Identify critical query patterns

    3. Column usage patterns:
       get_dataset_queries(urn, column="customer_id", source="MANUAL", count=5)
       → See how a specific column is used in queries
       → Learn filtering and grouping patterns for that column
       → Discover relationships via JOIN patterns

    4. General usage exploration:
       get_dataset_queries(urn, count=10)
       → Get mix of manual and system queries
       → Understand overall table usage

    EXAMPLES:

    - Get manual queries for SQL generation:
      get_dataset_queries(
          urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,prod.sales.orders,PROD)",
          source="MANUAL",
          count=10
      )

    - Get dashboard queries (production usage):
      get_dataset_queries(
          urn="urn:li:dataset:(...)",
          source="SYSTEM",
          count=20
      )

    - Column-specific query patterns:
      get_dataset_queries(
          urn="urn:li:dataset:(...)",
          column="created_at",
          source="MANUAL",
          count=5
      )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_dataset_queries(urn="urn:li:dataset:(...)", source="MANUAL")

    ANALYZING RETRIEVED QUERIES:
    Once you retrieve queries, examine the SQL statements to identify:
    - JOIN patterns: Which tables are joined? On what keys?
    - Aggregations: Common SUM, COUNT, AVG, GROUP BY patterns
    - Filters: Typical WHERE clauses, date range logic
    - Column usage: Which columns appear frequently vs rarely
    - CTEs and subqueries: Complex query structures

    BEST PRACTICES:
    - For SQL generation: Use source="MANUAL" (count of 5-10) to see real user patterns
    - For production analysis: Use source="SYSTEM" to see dashboard/report queries
    - Start with moderate count (5-10) to avoid overwhelming context
    - If no queries found (total=0), proceed without query examples - not all tables have queries
    - Parse the SQL statements yourself to find patterns - they are not full-text searchable
    """
    graph = get_graph()
    # With a column given, the helper may swap the dataset URN for a schema
    # field URN so the filter below targets that column.
    urn = maybe_convert_to_schema_field_urn(urn, column)

    entities_filter = FilterDsl.custom_filter(
        field="entities", condition="EQUAL", values=[urn]
    )
    _, compiled_filters = compile_filters(entities_filter)

    # Set up variables for the query
    query_input: dict = {
        "start": start,
        "count": count,
        "orFilters": compiled_filters,
    }
    # Add optional source filter
    if source is not None:
        query_input["source"] = source

    # Execute the GraphQL query
    result = execute_graphql(
        graph,
        query=queries_gql,
        variables={"input": query_input},
        operation_name="listQueries",
    )["listQueries"]

    for query in result["queries"]:
        if query.get("subjects"):
            query["subjects"] = _deduplicate_subjects(query["subjects"])

        # Truncate long SQL queries to prevent context window issues. The
        # statement (or its value) may be absent, so guard before touching it.
        statement = (query.get("properties") or {}).get("statement")
        if statement and statement.get("value") is not None:
            statement["value"] = truncate_query(statement["value"])

    return clean_gql_response(result)
|