datahub-agent-context 1.3.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. datahub_agent_context/__init__.py +25 -0
  2. datahub_agent_context/_version.py +16 -0
  3. datahub_agent_context/context.py +97 -0
  4. datahub_agent_context/langchain_tools/__init__.py +8 -0
  5. datahub_agent_context/langchain_tools/builder.py +127 -0
  6. datahub_agent_context/mcp_tools/__init__.py +46 -0
  7. datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
  8. datahub_agent_context/mcp_tools/base.py +325 -0
  9. datahub_agent_context/mcp_tools/descriptions.py +299 -0
  10. datahub_agent_context/mcp_tools/documents.py +473 -0
  11. datahub_agent_context/mcp_tools/domains.py +246 -0
  12. datahub_agent_context/mcp_tools/entities.py +349 -0
  13. datahub_agent_context/mcp_tools/get_me.py +99 -0
  14. datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
  15. datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
  16. datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
  17. datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
  18. datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
  19. datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
  20. datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
  21. datahub_agent_context/mcp_tools/gql/search.gql +242 -0
  22. datahub_agent_context/mcp_tools/helpers.py +448 -0
  23. datahub_agent_context/mcp_tools/lineage.py +698 -0
  24. datahub_agent_context/mcp_tools/owners.py +318 -0
  25. datahub_agent_context/mcp_tools/queries.py +191 -0
  26. datahub_agent_context/mcp_tools/search.py +239 -0
  27. datahub_agent_context/mcp_tools/structured_properties.py +447 -0
  28. datahub_agent_context/mcp_tools/tags.py +296 -0
  29. datahub_agent_context/mcp_tools/terms.py +295 -0
  30. datahub_agent_context/py.typed +2 -0
  31. datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
  32. datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
  33. datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
  34. datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
@@ -0,0 +1,698 @@
1
+ """Tools for getting lineage information."""
2
+
3
+ import logging
4
+ import pathlib
5
+ from typing import Any, Dict, List, Literal, Optional
6
+
7
+ from pydantic import BaseModel
8
+
9
+ from datahub.errors import ItemNotFoundError
10
+ from datahub.sdk.search_client import compile_filters
11
+ from datahub.sdk.search_filters import Filter, FilterDsl, load_filters
12
+ from datahub_agent_context.context import get_graph
13
+ from datahub_agent_context.mcp_tools.base import clean_gql_response, execute_graphql
14
+ from datahub_agent_context.mcp_tools.helpers import (
15
+ _extract_lineage_columns_from_paths,
16
+ _select_results_within_budget,
17
+ clean_get_entities_response,
18
+ inject_urls_for_urns,
19
+ maybe_convert_to_schema_field_urn,
20
+ truncate_descriptions,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Load the GraphQL document once at import time. The encoding is pinned so the
# decoded text does not depend on the platform's locale default.
entity_details_fragment_gql = (
    pathlib.Path(__file__).parent / "gql/entity_details.gql"
).read_text(encoding="utf-8")
29
+
30
+
31
class AssetLineageDirective(BaseModel):
    """Configuration for lineage query.

    Bundles the values that AssetLineageAPI.get_lineage reads when it builds
    a lineage search request.
    """

    # URN of the entity whose lineage is queried.
    urn: str
    # When true, the UPSTREAM direction is requested.
    upstream: bool
    # When true, the DOWNSTREAM direction is requested.
    downstream: bool
    # Maximum number of hops; converted into a "degree" filter by the API.
    max_hops: int
    # Optional extra filter that is AND-ed with the degree filter.
    extra_filters: Optional[Filter]
    # Sent as the request's "count" value.
    max_results: int
40
+
41
+
42
class AssetLineageAPI:
    """Wrapper that issues lineage search requests against the context graph."""

    def __init__(self) -> None:
        """Bind the API to the graph returned by get_graph()."""
        self.graph = get_graph()

    def get_degree_filter(self, max_hops: int) -> Filter:
        """Build the "degree" filter that limits how many hops are searched.

        Args:
            max_hops: Maximum number of hops to search for lineage

        Returns:
            Filter matching degrees 1..max_hops for 1 or 2 hops, or
            "1", "2" and "3+" for three or more hops

        Raises:
            ValueError: If max_hops is not 1, 2 or at least 3
        """
        if max_hops in (1, 2):
            degrees = [str(hop) for hop in range(1, max_hops + 1)]
        elif max_hops >= 3:
            degrees = ["1", "2", "3+"]
        else:
            raise ValueError(f"Invalid number of hops: {max_hops}")

        return FilterDsl.custom_filter(
            field="degree",
            condition="EQUAL",
            values=degrees,
        )

    def get_lineage(
        self,
        asset_lineage_directive: AssetLineageDirective,
        query: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run the lineage search for each direction enabled in the directive.

        Args:
            asset_lineage_directive: Lineage query configuration
            query: Optional search query to filter lineage results ("*" when omitted)

        Returns:
            Dictionary holding an "upstreams" and/or "downstreams" entry,
            depending on which directions the directive enables
        """
        directive = asset_lineage_directive

        # The degree filter is always applied; caller filters are AND-ed on top.
        combined_filter = self.get_degree_filter(directive.max_hops)
        if directive.extra_filters:
            combined_filter = FilterDsl.and_(combined_filter, directive.extra_filters)
        entity_types, or_filters = compile_filters(combined_filter)

        # Input shared by both directions; only "direction" differs per request.
        base_input = {
            "urn": directive.urn,
            "query": query or "*",
            "start": 0,
            "count": directive.max_results,
            "types": entity_types,
            "orFilters": or_filters,
            "searchFlags": {"skipHighlighting": True, "maxAggValues": 3},
        }

        requested_directions = (
            ("upstreams", "UPSTREAM", directive.upstream),
            ("downstreams", "DOWNSTREAM", directive.downstream),
        )

        lineage: Dict[str, Any] = {}
        for result_key, gql_direction, enabled in requested_directions:
            if not enabled:
                continue
            response = execute_graphql(
                self.graph,
                query=entity_details_fragment_gql,
                variables={
                    "input": {
                        **base_input,
                        "direction": gql_direction,
                    }
                },
                operation_name="GetEntityLineage",
            )
            lineage[result_key] = clean_gql_response(response["searchAcrossLineage"])

        return lineage
135
+
136
+
137
def _clean_result_entity(result_item: dict) -> dict:
    """Clean a lineage result's entity in place and return the cleaned entity."""
    cleaned = clean_get_entities_response(result_item.get("entity", {}))
    result_item["entity"] = cleaned  # Mutate in place so the result keeps its other keys
    return cleaned  # Returned so the caller can measure it against the token budget


def get_lineage(
    urn: str,
    column: Optional[str] = None,
    query: Optional[str] = None,
    filters: Optional[Filter | str] = None,
    upstream: bool = True,
    max_hops: int = 1,
    max_results: int = 30,
    offset: int = 0,
) -> dict:
    """Get upstream or downstream lineage for any entity.

    Set upstream to True for upstream lineage, False for downstream lineage.
    Set `column: null` to get lineage for entire dataset or for entity type other than dataset.
    Setting max_hops to 3 is equivalent to unlimited hops.
    Usage and format of filters is same as that in search tool.

    Args:
        urn: Entity URN
        column: Optional column name for column-level lineage
        query: Optional search query to filter lineage results
        filters: Optional filters to apply
        upstream: True for upstream, False for downstream
        max_hops: Maximum number of hops (1-3+)
        max_results: Maximum number of results to return
        offset: Pagination offset (negative values are treated as 0)

    Returns:
        Dictionary with upstreams or downstreams field containing:
        - searchResults: List of lineage entities
        - facets: Aggregations
        - start: Starting offset
        - count: Number of results
        - total: Total number of entities
        - offset: Applied offset
        - returned: Number of entities returned
        - hasMore: Whether more results available
        - truncatedDueToTokenBudget: Present (True) when the token budget cut the page short
        - metadata: Additional metadata for column-level lineage

    PAGINATION:
    Use offset to paginate through large lineage graphs:
    - offset=0, max_results=30 → first 30 entities
    - offset=30, max_results=30 → next 30 entities

    Note: Token budget constraints may return fewer entities than max_results.
    Check the returned metadata (hasMore, returned, etc.) to understand truncation.

    QUERY PARAMETER - Search within lineage results:
    You can filter lineage results using the `query` parameter with same /q syntax as search tool:
    - /q workspace.growthgestaofin → find tables in specific schema
    - /q customer+transactions → find entities with both terms
    - /q looker OR tableau → find dashboards on either platform
    - /q * → get all lineage results (default)

    Examples:
    - Find specific table in 643 downstreams: query="workspace.growthgestaofin.qs_retention"
    - Find Looker dashboards in lineage: query="/q tag:looker"
    - Get all results: query="*" or omit parameter

    MAX_RESULTS PARAMETER - Control result size:
    - Default: 30 results
    - For aggregation: max_results=30 is sufficient (facets computed on ALL items server-side)
    - For finding specific item: Increase max_results or use query to filter
    - Example: max_results=100 for larger result sets

    WHEN TO USE QUERY vs MAX_RESULTS:
    - User asks "is X affected?" → Use query to filter for X specifically
    - Large lineage (>30 items) → Keep max_results=30, use facets for aggregation
    - Need complete list → Increase max_results only if total ≤100

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_lineage(urn="urn:li:dataset:(...)", upstream=True)
    """
    graph = get_graph()
    # Normalize column parameter: Some LLMs pass the string "null" instead of JSON null.
    # Note: This means columns literally named "null" cannot be queried.
    if column == "null" or column == "":
        column = None

    # Parse filters if provided as string
    if isinstance(filters, str):
        filters = load_filters(filters)

    # A negative offset would slice from the tail of the result list below.
    offset = max(offset, 0)

    lineage_api = AssetLineageAPI()

    urn = maybe_convert_to_schema_field_urn(urn, column)
    # The lineage API always requests from start=0, so the requested page only
    # exists in the response if we fetch offset + max_results rows.
    requested_count = offset + max_results
    asset_lineage_directive = AssetLineageDirective(
        urn=urn,
        upstream=upstream,
        downstream=not upstream,
        max_hops=max_hops,
        extra_filters=filters,
        max_results=requested_count,
    )
    lineage = lineage_api.get_lineage(asset_lineage_directive, query=query)
    inject_urls_for_urns(graph, lineage, ["*.searchResults[].entity"])
    truncate_descriptions(lineage)

    # Track if this is column-level lineage for metadata
    is_column_level_lineage = column is not None

    # Apply offset, entity-level truncation, and cleaning to upstreams/downstreams
    for direction in ("upstreams", "downstreams"):
        direction_results = lineage.get(direction)
        if not direction_results:
            continue

        raw_results = direction_results.get("searchResults") or []
        fetched_count = len(raw_results)

        # The server can only hold further rows if it filled the whole request
        # and reports a larger total than what was fetched.
        # NOTE(review): relies on the response carrying an integer "total" as
        # documented above — falls back to "no more" when it is absent.
        server_total = direction_results.get("total")
        server_has_more = (
            isinstance(server_total, int)
            and fetched_count >= requested_count
            and server_total > fetched_count
        )

        if not raw_results:
            # Keep the pagination keys present even when nothing matched.
            direction_results["offset"] = offset
            direction_results["returned"] = 0
            direction_results["hasMore"] = server_has_more
            continue

        # Extract lineageColumns from paths for column-level lineage
        search_results = _extract_lineage_columns_from_paths(raw_results)
        total_available = len(search_results)

        # Skip the first `offset` entities, then apply the token budget.
        results_after_offset = search_results[offset:]
        if results_after_offset:
            # Entities are cleaned in place; other keys (e.g. degree) are preserved.
            selected_results = list(
                _select_results_within_budget(
                    results=iter(results_after_offset),
                    fetch_entity=_clean_result_entity,
                    max_results=max_results,
                )
            )
        else:
            selected_results = []

        returned = len(selected_results)

        # Update results and add metadata
        direction_results["searchResults"] = selected_results
        direction_results["offset"] = offset
        direction_results["returned"] = returned
        direction_results["hasMore"] = (
            offset + returned
        ) < total_available or server_has_more

        # Only flag budget truncation when fewer rows came back than the page
        # could have held.
        if returned < min(len(results_after_offset), max_results):
            direction_results["truncatedDueToTokenBudget"] = True

        logger.info(
            "get_lineage %s: Returned %s/%s entities (offset=%s, hasMore=%s)",
            direction,
            returned,
            total_available,
            offset,
            direction_results["hasMore"],
        )

    # Add metadata for column-level lineage responses
    if is_column_level_lineage:
        lineage["metadata"] = {
            "queryType": "column-level-lineage",
            "groupedBy": "dataset",
            "fields": {
                "lineageColumns": {
                    "description": "Columns in each dataset that have a lineage relationship with the source column",
                    "semantics": {
                        "downstream": "Columns derived from the source column",
                        "upstream": "Columns that the source column depends on",
                    },
                }
            },
        }

    return lineage
311
+
312
+
313
+ def _find_result_with_target_urn(
314
+ search_results: List[dict],
315
+ target_urn: str,
316
+ is_column_level: bool,
317
+ ) -> Optional[dict]:
318
+ """Find the search result that contains the target URN.
319
+
320
+ For column-level lineage: Searches paths to find one ending with target column URN
321
+ For dataset-level lineage: Matches entity URN directly
322
+
323
+ Args:
324
+ search_results: List of lineage search results
325
+ target_urn: URN to search for (dataset or schemaField)
326
+ is_column_level: Whether this is column-level lineage
327
+
328
+ Returns:
329
+ The search result containing the target, or None if not found
330
+ """
331
+ for result in search_results:
332
+ if is_column_level:
333
+ # Column-level: Check if any path ends with target column URN
334
+ paths = result.get("paths") or []
335
+ for path_obj in paths:
336
+ if not path_obj:
337
+ continue
338
+ path = path_obj.get("path") or []
339
+ if path and path[-1].get("urn") == target_urn:
340
+ return result
341
+ else:
342
+ # Dataset-level: Match entity URN directly
343
+ if result.get("entity", {}).get("urn") == target_urn:
344
+ return result
345
+
346
+ return None
347
+
348
+
349
def _find_upstream_lineage_path(
    query_urn: str,
    search_for_urn: str,
    source_urn: str,
    target_urn: str,
    source_column: Optional[str],
    target_column: Optional[str],
    semantic_direction: Literal["upstream", "downstream"],
) -> dict:
    """Query upstream lineage of one URN and return the paths leading to another.

    The lineage query issued here is always an upstream query. Callers that
    want a downstream answer query from the other end and reverse the paths,
    on the premise that "B is downstream of A" and "A is upstream of B"
    describe the same path seen from opposite ends.

    Args:
        query_urn: URN whose upstream lineage is queried
        search_for_urn: URN to locate among the upstream results
        source_urn: Original source URN (only used in the response and error messages)
        target_urn: Original target URN (only used in the response and error messages)
        source_column: Original source column (only used in the response and error messages)
        target_column: Original target column; when not None the match is column-level
        semantic_direction: Direction the user asked for (reported in metadata only)

    Returns:
        Dictionary with metadata, source, target, pathCount and paths

    Raises:
        ItemNotFoundError: If the upstream query is empty or the URN is not among the results
        ValueError: If the matching result carries no path information
    """
    graph = get_graph()

    # Always an upstream query, with generous hop/result limits so the
    # searched-for URN has a chance of being among the results.
    lineage_api = AssetLineageAPI()
    directive = AssetLineageDirective(
        urn=query_urn,
        upstream=True,
        downstream=False,
        max_hops=10,
        extra_filters=None,
        max_results=100,
    )
    lineage = lineage_api.get_lineage(directive, query="*")

    # Same response clean-up that the public lineage tool applies.
    inject_urls_for_urns(graph, lineage, ["*.searchResults[].entity"])
    truncate_descriptions(lineage)

    source_suffix = f".{source_column}" if source_column else ""
    target_suffix = f".{target_column}" if target_column else ""

    search_results = lineage.get("upstreams", {}).get("searchResults", [])
    if not search_results:
        raise ItemNotFoundError(f"No lineage found from {source_urn}" + source_suffix)

    matching_result = _find_result_with_target_urn(
        search_results=search_results,
        target_urn=search_for_urn,
        is_column_level=(target_column is not None),
    )
    if not matching_result:
        raise ItemNotFoundError(
            f"No lineage path found from {source_urn}"
            + source_suffix
            + f" to {target_urn}"
            + target_suffix
        )

    # Paths are passed through with their QUERY URNs untouched.
    raw_paths = matching_result.get("paths", [])
    if not raw_paths:
        raise ValueError(
            "Target found but no path information available. "
            "This may indicate the entities are directly connected without intermediate steps."
        )

    cleaned_paths = clean_gql_response(raw_paths)

    # Detect whether any hop in any path is a QUERY entity, tolerating null
    # path objects and null hops.
    has_queries = False
    for path_obj in cleaned_paths:
        if not path_obj:
            continue
        for hop in path_obj.get("path") or []:
            if hop and hop.get("type") == "QUERY":
                has_queries = True
                break
        if has_queries:
            break

    paths_metadata: dict[str, str] = {
        "description": "Array of lineage paths showing transformation chains from source to target",
        "structure": "Each path contains alternating entities (SCHEMA_FIELD or DATASET) and optional transformation QUERY entities",
    }
    # The enrichment hint is only useful when a QUERY entity is actually present.
    if has_queries:
        paths_metadata["queryEntities"] = (
            "QUERY entities are returned as URNs only. "
            "Use get_entities(query_urn) to fetch SQL statement and other query details."
        )

    source_info = {"urn": source_urn}
    if source_column:
        source_info["column"] = source_column

    target_info = {"urn": target_urn}
    if target_column:
        target_info["column"] = target_column

    return {
        "metadata": {
            "queryType": "lineage-path-trace",
            "direction": semantic_direction,
            "pathType": "column-level" if source_column else "dataset-level",
            "fields": {"paths": paths_metadata},
        },
        "source": source_info,
        "target": target_info,
        "pathCount": len(cleaned_paths),
        "paths": cleaned_paths,
    }
483
+
484
+
485
def _find_lineage_path(
    query_urn: str,
    target_full_urn: str,
    source_urn: str,
    target_urn: str,
    source_column: Optional[str],
    target_column: Optional[str],
    direction: Literal["upstream", "downstream"],
) -> dict:
    """Find the lineage path between two URNs for one semantic direction.

    The underlying query is always an upstream query. For "downstream" the
    query starts at the target and looks for the source, and every returned
    path is then reversed so it reads source → target. For any other value
    the query starts at the source and the paths are returned as received.

    Kept separate from the public entry point so direction auto-discovery
    can call it once per candidate direction.

    Args:
        query_urn: Full URN of the source (dataset or schema field)
        target_full_urn: Full URN of the target (dataset or schema field)
        source_urn: Source dataset URN
        target_urn: Target dataset URN
        source_column: Source column name
        target_column: Target column name
        direction: Direction to search ("upstream" or "downstream")

    Returns:
        Dictionary with paths and metadata

    Raises:
        ItemNotFoundError: If no lineage path found
    """
    is_downstream = direction == "downstream"

    # Downstream: start at the target and look for the source.
    # Upstream: start at the source and look for the target.
    if is_downstream:
        start_urn, wanted_urn = target_full_urn, query_urn
    else:
        start_urn, wanted_urn = query_urn, target_full_urn

    result = _find_upstream_lineage_path(
        query_urn=start_urn,
        search_for_urn=wanted_urn,
        source_urn=source_urn,
        target_urn=target_urn,
        source_column=source_column,
        target_column=target_column,
        semantic_direction=direction,
    )

    if is_downstream:
        # result["paths"] is a list of {"path": [entity, ...]} objects. The
        # upstream query lists hops target-first; flip each one so the
        # caller sees source → target.
        for path_obj in result.get("paths", []):
            if path_obj and "path" in path_obj:
                path_obj["path"] = list(reversed(path_obj["path"]))

    return result
554
+
555
+
556
def get_lineage_paths_between(
    source_urn: str,
    target_urn: str,
    source_column: Optional[str] = None,
    target_column: Optional[str] = None,
    direction: Optional[Literal["upstream", "downstream"]] = None,
) -> dict:
    """Get detailed lineage path(s) between two specific entities or columns.

    Returns the paths array from searchAcrossLineage, showing the exact
    transformation chain(s) including intermediate entities, columns, and
    transformation query URNs.

    Where get_lineage() lists every lineage target with compact
    lineageColumns, this tool targets ONE specific entity/column and returns
    the detailed path information for it.

    Args:
        source_urn: URN of the source dataset
        target_urn: URN of the target dataset
        source_column: Optional column name in source dataset
        target_column: Optional column name in target dataset (required if source_column provided)
        direction: Optional direction to search. If None (default), the path is
            discovered automatically by trying downstream first, then upstream.
            Pass "downstream" or "upstream" explicitly for better performance
            when the direction is known.

    Returns:
        Dictionary with:
        - source: Source entity/column info
        - target: Target entity/column info
        - paths: Array of path objects from GraphQL (with QUERY URNs)
        - pathCount: Number of paths found
        - metadata: Query metadata including direction and path type

    Examples:
        # Column-level paths
        paths_result = get_lineage_paths_between(
            source_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.base_table,PROD)",
            target_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.final_table,PROD)",
            source_column="user_id",
            target_column="customer_id"
        )
        # Returns paths with QUERY URNs showing transformation chain

        # Fetch SQL for specific query of interest
        query_details = get_entities(paths_result["paths"][0]["path"][1]["urn"])

        # Dataset-level paths (auto-discover direction)
        get_lineage_paths_between(
            source_urn="urn:li:dataset:(...):base_table",
            target_urn="urn:li:dataset:(...):final_table"
        )

        # Explicit direction for better performance
        get_lineage_paths_between(
            source_urn="urn:li:dataset:(...):base_table",
            target_urn="urn:li:dataset:(...):final_table",
            direction="downstream"
        )

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = get_lineage_paths_between(
                source_urn="urn:li:dataset:(...)",
                target_urn="urn:li:dataset:(...)"
            )

    Raises:
        ValueError: If column parameters are mismatched or invalid
        ItemNotFoundError: If no lineage path found
    """
    # The string "null" and the empty string both mean "no column".
    if source_column in ("null", ""):
        source_column = None
    if target_column in ("null", ""):
        target_column = None

    # Column-level lineage needs a column on both ends; dataset-level needs none.
    has_source_column = source_column is not None
    has_target_column = target_column is not None
    if has_source_column != has_target_column:
        raise ValueError(
            "Both source_column and target_column must be provided for column-level lineage, "
            "or both must be None for dataset-level lineage"
        )

    # Resolve to schema-field URNs when columns are given.
    query_urn = maybe_convert_to_schema_field_urn(source_urn, source_column)
    target_full_urn = maybe_convert_to_schema_field_urn(target_urn, target_column)

    def _search(path_direction: Literal["upstream", "downstream"]) -> dict:
        return _find_lineage_path(
            query_urn=query_urn,
            target_full_urn=target_full_urn,
            source_urn=source_urn,
            target_urn=target_urn,
            source_column=source_column,
            target_column=target_column,
            direction=path_direction,
        )

    # Caller picked a direction: single lookup, errors propagate unchanged.
    if direction is not None:
        return _search(direction)

    # Auto-discovery: downstream first (the more common use case), then upstream.
    attempts: list[tuple[Literal["upstream", "downstream"], str]] = [
        ("downstream", "auto-discovered-downstream"),
        ("upstream", "auto-discovered-upstream"),
    ]
    for attempt_direction, discovered_label in attempts:
        try:
            result = _search(attempt_direction)
            result["metadata"]["direction"] = discovered_label
            result["metadata"]["note"] = (
                "Direction was automatically discovered. Specify direction='downstream' or 'upstream' explicitly for better performance."
            )
            return result
        except ItemNotFoundError:
            continue

    # Neither direction produced a path.
    raise ItemNotFoundError(
        f"No lineage path found between {source_urn}"
        + (f".{source_column}" if source_column else "")
        + f" and {target_urn}"
        + (f".{target_column}" if target_column else "")
        + " in either upstream or downstream direction"
    ) from None