datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
"""Tools for getting lineage information."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import pathlib
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from datahub.errors import ItemNotFoundError
|
|
10
|
+
from datahub.sdk.search_client import compile_filters
|
|
11
|
+
from datahub.sdk.search_filters import Filter, FilterDsl, load_filters
|
|
12
|
+
from datahub_agent_context.context import get_graph
|
|
13
|
+
from datahub_agent_context.mcp_tools.base import clean_gql_response, execute_graphql
|
|
14
|
+
from datahub_agent_context.mcp_tools.helpers import (
|
|
15
|
+
_extract_lineage_columns_from_paths,
|
|
16
|
+
_select_results_within_budget,
|
|
17
|
+
clean_get_entities_response,
|
|
18
|
+
inject_urls_for_urns,
|
|
19
|
+
maybe_convert_to_schema_field_urn,
|
|
20
|
+
truncate_descriptions,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)

# Load the GraphQL document that ships next to this module, once, at import time.
# The encoding is pinned so the read does not depend on the platform's locale
# default (which is not UTF-8 on every system).
entity_details_fragment_gql = (
    pathlib.Path(__file__).parent / "gql/entity_details.gql"
).read_text(encoding="utf-8")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AssetLineageDirective(BaseModel):
    """Parameters describing one lineage lookup.

    Instances are consumed by ``AssetLineageAPI.get_lineage``.
    """

    # URN of the entity whose lineage is requested.
    urn: str
    # When true, the UPSTREAM direction is queried.
    upstream: bool
    # When true, the DOWNSTREAM direction is queried.
    downstream: bool
    # Hop limit; translated into a ``degree`` filter by the API.
    max_hops: int
    # Optional extra filter that is AND-ed with the degree filter.
    extra_filters: Optional[Filter]
    # Sent as the ``count`` of the lineage search request.
    max_results: int
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class AssetLineageAPI:
    """Thin client that runs lineage searches against the DataHub graph."""

    def __init__(self) -> None:
        """Bind the API to the graph returned by ``get_graph()``."""
        self.graph = get_graph()

    def get_degree_filter(self, max_hops: int) -> Filter:
        """Build the ``degree`` filter that limits how many hops are searched.

        Args:
            max_hops: Maximum number of hops to search for lineage

        Returns:
            Filter on the ``degree`` field: ``"1"`` / ``"1", "2"`` for one or
            two hops, and ``"1", "2", "3+"`` for three or more.

        Raises:
            ValueError: If max_hops is invalid (below 1)
        """
        if max_hops in (1, 2):
            degrees = [str(hop) for hop in range(1, max_hops + 1)]
        elif max_hops >= 3:
            # Everything beyond two hops is addressed by the single "3+" value.
            degrees = ["1", "2", "3+"]
        else:
            raise ValueError(f"Invalid number of hops: {max_hops}")

        return FilterDsl.custom_filter(
            field="degree",
            condition="EQUAL",
            values=degrees,
        )

    def get_lineage(
        self,
        asset_lineage_directive: AssetLineageDirective,
        query: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run the lineage search(es) requested by the directive.

        Args:
            asset_lineage_directive: Lineage query configuration
            query: Optional search query to filter lineage results ("*" when omitted)

        Returns:
            Dictionary with an ``upstreams`` and/or ``downstreams`` entry, each
            holding the cleaned ``searchAcrossLineage`` response for that
            direction. Directions that were not requested are absent.
        """
        directive = asset_lineage_directive

        # Hop limit first, then any caller-supplied filter on top of it.
        combined_filter = self.get_degree_filter(directive.max_hops)
        if directive.extra_filters:
            combined_filter = FilterDsl.and_(combined_filter, directive.extra_filters)
        types, compiled_filters = compile_filters(combined_filter)

        # Input shared by both directions; only "direction" differs per request.
        base_input = {
            "urn": directive.urn,
            "query": query or "*",
            "start": 0,
            "count": directive.max_results,
            "types": types,
            "orFilters": compiled_filters,
            "searchFlags": {"skipHighlighting": True, "maxAggValues": 3},
        }

        requested = []
        if directive.upstream:
            requested.append(("upstreams", "UPSTREAM"))
        if directive.downstream:
            requested.append(("downstreams", "DOWNSTREAM"))

        lineage: Dict[str, Any] = {}
        for result_key, gql_direction in requested:
            response = execute_graphql(
                self.graph,
                query=entity_details_fragment_gql,
                variables={"input": {**base_input, "direction": gql_direction}},
                operation_name="GetEntityLineage",
            )
            lineage[result_key] = clean_gql_response(response["searchAcrossLineage"])

        return lineage
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _clean_lineage_result_entity(result_item: dict) -> dict:
    """Replace ``result_item["entity"]`` with its cleaned form and return it.

    The cleaned entity is returned so the budget helper can size it, while the
    in-place assignment makes the selected results carry the cleaned entity.
    """
    cleaned = clean_get_entities_response(result_item.get("entity", {}))
    result_item["entity"] = cleaned
    return cleaned


def get_lineage(
    urn: str,
    column: Optional[str] = None,
    query: Optional[str] = None,
    filters: Optional[Filter | str] = None,
    upstream: bool = True,
    max_hops: int = 1,
    max_results: int = 30,
    offset: int = 0,
) -> dict:
    """Get upstream or downstream lineage for any entity.

    Set upstream to True for upstream lineage, False for downstream lineage.
    Set `column: null` to get lineage for entire dataset or for entity type other than dataset.
    Setting max_hops to 3 is equivalent to unlimited hops.
    Usage and format of filters is same as that in search tool.

    Args:
        urn: Entity URN
        column: Optional column name for column-level lineage
        query: Optional search query to filter lineage results
        filters: Optional filters to apply
        upstream: True for upstream, False for downstream
        max_hops: Maximum number of hops (1-3+)
        max_results: Maximum number of results to return
        offset: Pagination offset (negative values are treated as 0)

    Returns:
        Dictionary with upstreams or downstreams field containing:
        - searchResults: List of lineage entities
        - facets: Aggregations
        - start: Starting offset
        - count: Number of results
        - total: Total number of entities
        - offset: Applied offset
        - returned: Number of entities returned
        - hasMore: Whether more results available
        - metadata: Additional metadata for column-level lineage

    PAGINATION:
    Use offset to paginate through large lineage graphs:
    - offset=0, max_results=30 → first 30 entities
    - offset=30, max_results=30 → next 30 entities

    Note: Token budget constraints may return fewer entities than max_results.
    Check the returned metadata (hasMore, returned, etc.) to understand truncation.

    QUERY PARAMETER - Search within lineage results:
    You can filter lineage results using the `query` parameter with same /q syntax as search tool:
    - /q workspace.growthgestaofin → find tables in specific schema
    - /q customer+transactions → find entities with both terms
    - /q looker OR tableau → find dashboards on either platform
    - /q * → get all lineage results (default)

    Examples:
    - Find specific table in 643 downstreams: query="workspace.growthgestaofin.qs_retention"
    - Find Looker dashboards in lineage: query="/q tag:looker"
    - Get all results: query="*" or omit parameter

    MAX_RESULTS PARAMETER - Control result size:
    - Default: 30 results
    - For aggregation: max_results=30 is sufficient (facets computed on ALL items server-side)
    - For finding specific item: Increase max_results or use query to filter
    - Example: max_results=100 for larger result sets

    WHEN TO USE QUERY vs MAX_RESULTS:
    - User asks "is X affected?" → Use query to filter for X specifically
    - Large lineage (>30 items) → Keep max_results=30, use facets for aggregation
    - Need complete list → Increase max_results only if total ≤100

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_lineage(urn="urn:li:dataset:(...)", upstream=True)
    """
    graph = get_graph()

    # Normalize column parameter: Some LLMs pass the string "null" instead of JSON null.
    # Note: This means columns literally named "null" cannot be queried.
    if column == "null" or column == "":
        column = None

    # Parse filters if provided as string
    if isinstance(filters, str):
        filters = load_filters(filters)

    # A negative offset would make the slice below count from the end of the list.
    offset = max(offset, 0)

    lineage_api = AssetLineageAPI()

    urn = maybe_convert_to_schema_field_urn(urn, column)
    asset_lineage_directive = AssetLineageDirective(
        urn=urn,
        upstream=upstream,
        downstream=not upstream,
        max_hops=max_hops,
        extra_filters=filters,
        # The API always searches from start=0 and the offset is applied
        # client-side below, so the request has to cover the skipped rows plus
        # the page itself. One extra row is requested purely to learn whether
        # anything exists beyond this page (it is never returned).
        max_results=offset + max_results + 1,
    )
    lineage = lineage_api.get_lineage(asset_lineage_directive, query=query)
    inject_urls_for_urns(graph, lineage, ["*.searchResults[].entity"])
    truncate_descriptions(lineage)

    # Track if this is column-level lineage for metadata
    is_column_level_lineage = column is not None

    # Apply offset, entity-level truncation, and cleaning to upstreams/downstreams
    for direction in ("upstreams", "downstreams"):
        direction_results = lineage.get(direction)
        if not direction_results:
            continue
        search_results = direction_results.get("searchResults")
        if not search_results:
            continue

        # Extract lineageColumns from paths for column-level lineage
        search_results = _extract_lineage_columns_from_paths(search_results)
        direction_results["searchResults"] = search_results

        # Number of rows fetched for this request window (skipped + page + probe).
        total_available = len(search_results)

        # Offset points past everything the server had: empty page, nothing more.
        if offset >= total_available:
            direction_results["searchResults"] = []
            direction_results["offset"] = offset
            direction_results["returned"] = 0
            direction_results["hasMore"] = False
            continue

        # The page proper: skip the offset and leave the probe row out.
        page_candidates = search_results[offset : offset + max_results]

        # Get results within budget (entities cleaned in place, degree preserved)
        selected_results = list(
            _select_results_within_budget(
                results=iter(page_candidates),
                fetch_entity=_clean_lineage_result_entity,
                max_results=max_results,
            )
        )

        # Update results and add metadata
        direction_results["searchResults"] = selected_results
        direction_results["offset"] = offset
        direction_results["returned"] = len(selected_results)
        # True when the token budget cut the page short or the probe row exists.
        direction_results["hasMore"] = (
            offset + len(selected_results)
        ) < total_available

        if len(selected_results) < len(page_candidates):
            direction_results["truncatedDueToTokenBudget"] = True

        logger.info(
            "get_lineage %s: Returned %s/%s entities (offset=%s, hasMore=%s)",
            direction,
            len(selected_results),
            total_available,
            offset,
            direction_results["hasMore"],
        )

    # Add metadata for column-level lineage responses
    if is_column_level_lineage:
        lineage["metadata"] = {
            "queryType": "column-level-lineage",
            "groupedBy": "dataset",
            "fields": {
                "lineageColumns": {
                    "description": "Columns in each dataset that have a lineage relationship with the source column",
                    "semantics": {
                        "downstream": "Columns derived from the source column",
                        "upstream": "Columns that the source column depends on",
                    },
                }
            },
        }

    return lineage
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _find_result_with_target_urn(
|
|
314
|
+
search_results: List[dict],
|
|
315
|
+
target_urn: str,
|
|
316
|
+
is_column_level: bool,
|
|
317
|
+
) -> Optional[dict]:
|
|
318
|
+
"""Find the search result that contains the target URN.
|
|
319
|
+
|
|
320
|
+
For column-level lineage: Searches paths to find one ending with target column URN
|
|
321
|
+
For dataset-level lineage: Matches entity URN directly
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
search_results: List of lineage search results
|
|
325
|
+
target_urn: URN to search for (dataset or schemaField)
|
|
326
|
+
is_column_level: Whether this is column-level lineage
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
The search result containing the target, or None if not found
|
|
330
|
+
"""
|
|
331
|
+
for result in search_results:
|
|
332
|
+
if is_column_level:
|
|
333
|
+
# Column-level: Check if any path ends with target column URN
|
|
334
|
+
paths = result.get("paths") or []
|
|
335
|
+
for path_obj in paths:
|
|
336
|
+
if not path_obj:
|
|
337
|
+
continue
|
|
338
|
+
path = path_obj.get("path") or []
|
|
339
|
+
if path and path[-1].get("urn") == target_urn:
|
|
340
|
+
return result
|
|
341
|
+
else:
|
|
342
|
+
# Dataset-level: Match entity URN directly
|
|
343
|
+
if result.get("entity", {}).get("urn") == target_urn:
|
|
344
|
+
return result
|
|
345
|
+
|
|
346
|
+
return None
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _find_upstream_lineage_path(
    query_urn: str,
    search_for_urn: str,
    source_urn: str,
    target_urn: str,
    source_column: Optional[str],
    target_column: Optional[str],
    semantic_direction: Literal["upstream", "downstream"],
) -> dict:
    """Internal helper to find upstream lineage path.

    Always queries upstream lineage (more bounded than downstream).

    KEY INSIGHT: Lineage is isotropic (symmetric):
    - If B is in A's downstream, then A is in B's upstream
    - The path is the same, just viewed from different ends
    - Therefore, we can always query upstream and reverse the path for downstream queries

    This optimization significantly reduces response sizes:
    - Upstream: Typically 10-100 results (bounded by data sources)
    - Downstream: Can be 1000s of results (unlimited consumers)

    Args:
        query_urn: URN to query lineage from (could be source or target depending on semantic direction)
        search_for_urn: URN to search for in results
        source_urn: Original source URN (for response metadata)
        target_urn: Original target URN (for response metadata)
        source_column: Original source column (for response metadata)
        target_column: Original target column (for response metadata)
        semantic_direction: User's requested direction (for metadata, not query direction)

    Returns:
        Dictionary with ``metadata``, ``source``, ``target``, ``pathCount`` and ``paths``

    Raises:
        ItemNotFoundError: If the upstream search is empty or does not contain the searched URN
        ValueError: If the matching result carries no path information
    """
    graph = get_graph()

    # Get lineage with paths using the API directly (always upstream)
    lineage_api = AssetLineageAPI()
    asset_lineage_directive = AssetLineageDirective(
        urn=query_urn,
        upstream=True,  # Always upstream
        downstream=False,
        max_hops=10,  # Higher to ensure we find target
        extra_filters=None,
        max_results=100,  # Need enough results to find target
    )
    lineage = lineage_api.get_lineage(asset_lineage_directive, query="*")

    # Clean up the response
    inject_urls_for_urns(graph, lineage, ["*.searchResults[].entity"])
    truncate_descriptions(lineage)

    # Get upstream results (always querying upstream). Both the "upstreams"
    # entry and its "searchResults" may be present but null; treating that as
    # "no results" raises ItemNotFoundError below instead of an AttributeError,
    # which keeps the caller's direction auto-discovery fallback working.
    upstream_results = lineage.get("upstreams") or {}
    search_results = upstream_results.get("searchResults") or []

    if not search_results:
        raise ItemNotFoundError(
            f"No lineage found from {source_urn}"
            + (f".{source_column}" if source_column else "")
        )

    # Find the result containing the target URN
    target_result = _find_result_with_target_urn(
        search_results=search_results,
        target_urn=search_for_urn,
        is_column_level=(target_column is not None),
    )

    if not target_result:
        raise ItemNotFoundError(
            f"No lineage path found from {source_urn}"
            + (f".{source_column}" if source_column else "")
            + f" to {target_urn}"
            + (f".{target_column}" if target_column else "")
        )

    # Extract paths array (with QUERY URNs as-is)
    paths = target_result.get("paths", [])
    if not paths:
        raise ValueError(
            "Target found but no path information available. "
            "This may indicate the entities are directly connected without intermediate steps."
        )

    # Clean the paths response
    cleaned_paths = clean_gql_response(paths)

    # Check if any paths contain QUERY entities (with safe null handling)
    has_queries = any(
        any(
            entity.get("type") == "QUERY"
            for entity in (path_obj.get("path") or [])
            if entity  # Skip None entities
        )
        for path_obj in cleaned_paths
        if path_obj  # Skip None path objects
    )

    # Build metadata
    paths_metadata: dict[str, str] = {
        "description": "Array of lineage paths showing transformation chains from source to target",
        "structure": "Each path contains alternating entities (SCHEMA_FIELD or DATASET) and optional transformation QUERY entities",
    }

    # Add query enrichment note only when queries are present
    if has_queries:
        paths_metadata["queryEntities"] = (
            "QUERY entities are returned as URNs only. "
            "Use get_entities(query_urn) to fetch SQL statement and other query details."
        )

    metadata = {
        "queryType": "lineage-path-trace",
        "direction": semantic_direction,
        "pathType": "column-level" if source_column else "dataset-level",
        "fields": {"paths": paths_metadata},
    }

    # Build response with metadata; "column" is only included when one was given
    return {
        "metadata": metadata,
        "source": {
            "urn": source_urn,
            **({"column": source_column} if source_column else {}),
        },
        "target": {
            "urn": target_urn,
            **({"column": target_column} if target_column else {}),
        },
        "pathCount": len(cleaned_paths),
        "paths": cleaned_paths,
    }
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _find_lineage_path(
    query_urn: str,
    target_full_urn: str,
    source_urn: str,
    target_urn: str,
    source_column: Optional[str],
    target_column: Optional[str],
    direction: Literal["upstream", "downstream"],
) -> dict:
    """Internal helper to find lineage path in a specific direction.

    The lookup itself is always an upstream query; ``direction`` only decides
    which end the query starts from and whether the returned paths are flipped:

    - "downstream" (source flows TO target): query the target's upstream,
      look for the source, then reverse each path so it reads source → target.
    - otherwise (source depends ON target): query the source's upstream and
      look for the target; paths are returned as received.

    Kept separate from the public entry point so direction auto-discovery can
    call it once per candidate direction.

    Args:
        query_urn: URN to query lineage from
        target_full_urn: Target URN to find
        source_urn: Source dataset URN
        target_urn: Target dataset URN
        source_column: Source column name
        target_column: Target column name
        direction: Direction to search ("upstream" or "downstream")

    Returns:
        Dictionary with paths and metadata

    Raises:
        ItemNotFoundError: If no lineage path found
    """
    flip_paths = direction == "downstream"

    # Pick the end to start from and the end to look for.
    if flip_paths:
        start_urn, wanted_urn = target_full_urn, query_urn
    else:
        start_urn, wanted_urn = query_urn, target_full_urn

    found = _find_upstream_lineage_path(
        query_urn=start_urn,
        search_for_urn=wanted_urn,
        source_urn=source_urn,
        target_urn=target_urn,
        source_column=source_column,
        target_column=target_column,
        semantic_direction=direction,
    )

    if not flip_paths:
        return found

    # found["paths"] = [{"path": [entity1, entity2, ...]}, ...]; each list was
    # produced starting from the target, so flip it to start from the source.
    for path_obj in found.get("paths", []):
        if path_obj and "path" in path_obj:
            path_obj["path"] = list(reversed(path_obj["path"]))

    return found
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def get_lineage_paths_between(
    source_urn: str,
    target_urn: str,
    source_column: Optional[str] = None,
    target_column: Optional[str] = None,
    direction: Optional[Literal["upstream", "downstream"]] = None,
) -> dict:
    """Get detailed lineage path(s) between two specific entities or columns.

    Returns the paths array from searchAcrossLineage, showing the exact transformation
    chain(s) including intermediate entities, columns, and transformation query URNs.

    Unlike get_lineage() which returns all lineage targets with compact lineageColumns,
    this tool focuses on ONE specific target and returns detailed path information.

    Args:
        source_urn: URN of the source dataset
        target_urn: URN of the target dataset
        source_column: Optional column name in source dataset
        target_column: Optional column name in target dataset (required if source_column provided)
        direction: Optional direction to search. If None (default), automatically discovers
            the path by trying downstream first, then upstream. Specify "downstream" or
            "upstream" explicitly for better performance if you know the direction.

    Returns:
        Dictionary with:
        - source: Source entity/column info
        - target: Target entity/column info
        - paths: Array of path objects from GraphQL (with QUERY URNs)
        - pathCount: Number of paths found
        - metadata: Query metadata including direction and path type

    Examples:
        # Column-level paths
        paths_result = get_lineage_paths_between(
            source_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.base_table,PROD)",
            target_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.final_table,PROD)",
            source_column="user_id",
            target_column="customer_id"
        )
        # Returns paths with QUERY URNs showing transformation chain

        # Fetch SQL for specific query of interest
        query_details = get_entities(paths_result["paths"][0]["path"][1]["urn"])

        # Dataset-level paths (auto-discover direction)
        get_lineage_paths_between(
            source_urn="urn:li:dataset:(...):base_table",
            target_urn="urn:li:dataset:(...):final_table"
        )

        # Explicit direction for better performance
        get_lineage_paths_between(
            source_urn="urn:li:dataset:(...):base_table",
            target_urn="urn:li:dataset:(...):final_table",
            direction="downstream"
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_lineage_paths_between(
                source_urn="urn:li:dataset:(...)",
                target_urn="urn:li:dataset:(...)"
            )

    Raises:
        ValueError: If column parameters are mismatched or invalid
        ItemNotFoundError: If no lineage path found
    """
    # "null" and "" are treated the same as an omitted column.
    if source_column in ("null", ""):
        source_column = None
    if target_column in ("null", ""):
        target_column = None

    # Exactly one column given is ambiguous: column-level needs both ends.
    if (source_column is None) ^ (target_column is None):
        raise ValueError(
            "Both source_column and target_column must be provided for column-level lineage, "
            "or both must be None for dataset-level lineage"
        )

    # With a column, each URN becomes the corresponding schema field URN.
    query_urn = maybe_convert_to_schema_field_urn(source_urn, source_column)
    target_full_urn = maybe_convert_to_schema_field_urn(target_urn, target_column)

    def _search(search_direction: Literal["upstream", "downstream"]) -> dict:
        # Same lookup for every attempt; only the direction varies.
        return _find_lineage_path(
            query_urn=query_urn,
            target_full_urn=target_full_urn,
            source_urn=source_urn,
            target_urn=target_urn,
            source_column=source_column,
            target_column=target_column,
            direction=search_direction,
        )

    # Caller named the direction: a single lookup, no metadata rewrite.
    if direction is not None:
        return _search(direction)

    def _mark_auto_discovered(found: dict, label: str) -> dict:
        # Record which direction matched and hint that it can be passed explicitly.
        found["metadata"]["direction"] = label
        found["metadata"]["note"] = (
            "Direction was automatically discovered. Specify direction='downstream' or 'upstream' explicitly for better performance."
        )
        return found

    # Auto-discovery: downstream first (more common use case), upstream as fallback.
    try:
        return _mark_auto_discovered(_search("downstream"), "auto-discovered-downstream")
    except ItemNotFoundError:
        try:
            return _mark_auto_discovered(_search("upstream"), "auto-discovered-upstream")
        except ItemNotFoundError:
            # Not found in either direction
            raise ItemNotFoundError(
                f"No lineage path found between {source_urn}"
                + (f".{source_column}" if source_column else "")
                + f" and {target_urn}"
                + (f".{target_column}" if target_column else "")
                + " in either upstream or downstream direction"
            ) from None
|