collibra-connector 1.0.19__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,716 @@
1
+ """
2
+ Lineage Builder - Declarative data lineage creation.
3
+
4
+ This module provides a fluent API for building and committing
5
+ technical data lineage in Collibra, including assets and relations.
6
+
7
+ Example:
8
+ >>> from collibra_connector import CollibraConnector
9
+ >>> from collibra_connector.lineage import LineageBuilder, LineageNode
10
+ >>>
11
+ >>> conn = CollibraConnector(...)
12
+ >>> builder = LineageBuilder(conn)
13
+ >>>
14
+ >>> # Define nodes
15
+ >>> s3_table = LineageNode("s3://bucket/raw/customers", "Table")
16
+ >>> glue_job = LineageNode("etl_transform_customers", "Data Pipeline")
17
+ >>> redshift_table = LineageNode("warehouse.customers", "Table")
18
+ >>>
19
+ >>> # Build lineage
20
+ >>> builder.add_edge(s3_table, glue_job, "is source for")
21
+ >>> builder.add_edge(glue_job, redshift_table, "is target for")
22
+ >>>
23
+ >>> # Commit to Collibra
24
+ >>> result = builder.commit(domain_id="lineage-domain-uuid")
25
+ >>> print(f"Created {result.assets_created} assets, {result.relations_created} relations")
26
+ """
27
+ from __future__ import annotations
28
+
29
+ from dataclasses import dataclass, field
30
+ from enum import Enum
31
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING
32
+ from uuid import uuid4
33
+
34
+ if TYPE_CHECKING:
35
+ from .connector import CollibraConnector
36
+
37
+
38
class LineageDirection(str, Enum):
    """Which way data moves relative to a node in the lineage graph.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # Data flows TO this node.
    UPSTREAM = "upstream"
    # Data flows FROM this node.
    DOWNSTREAM = "downstream"
    BIDIRECTIONAL = "bidirectional"
43
+
44
+
45
class LineageRelationType(str, Enum):
    """Role names of frequently used lineage relations.

    Each member's value is the role string; members are ``str`` instances
    and therefore compare equal to that string.
    """

    # Data-flow roles.
    SOURCE_FOR = "is source for"
    TARGET_FOR = "is target for"
    TRANSFORMS = "transforms"
    DERIVED_FROM = "is derived from"

    # Structural roles.
    CONTAINS = "contains"
    PART_OF = "is part of"

    # Usage roles.
    USES = "uses"
    PRODUCES = "produces"
55
+
56
+
57
@dataclass
class LineageNode:
    """
    A single vertex of the lineage graph.

    A node stands for any data asset (table, file, ETL job, dashboard, ...).
    It either points at an asset that already exists in Collibra (when
    ``asset_id`` is set) or describes a new asset.

    Identity is per instance: equality and hashing use only the generated
    ``_internal_id``, so two nodes with identical field values are still
    different nodes.

    Attributes:
        name: The name of the node/asset.
        asset_type: The asset type name (e.g., "Table", "Data Pipeline").
        asset_id: Optional ID of an existing asset this node refers to.
        display_name: Optional display name.
        description: Optional description.
        attributes: Optional mapping of attribute name -> value.
        metadata: Optional mapping of additional metadata.

    Example:
        >>> # Reference existing asset
        >>> existing = LineageNode.from_id("existing-asset-uuid")
        >>>
        >>> # Define new asset to create
        >>> new_table = LineageNode(
        ...     name="raw.customers",
        ...     asset_type="Table",
        ...     description="Raw customer data from CRM",
        ...     attributes={"Data Source": "Salesforce"}
        ... )
    """

    name: str
    asset_type: str = "Data Asset"
    asset_id: Optional[str] = None
    display_name: Optional[str] = None
    description: Optional[str] = None
    attributes: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Random per-instance identifier; the sole basis of __eq__ and __hash__.
    _internal_id: str = field(default_factory=lambda: str(uuid4()))

    @classmethod
    def from_id(cls, asset_id: str, name: str = "") -> "LineageNode":
        """Build a node that refers to an already existing Collibra asset."""
        return cls(asset_id=asset_id, name=name)

    @classmethod
    def table(
        cls,
        name: str,
        schema: Optional[str] = None,
        database: Optional[str] = None,
        **kwargs: Any
    ) -> "LineageNode":
        """Build a "Table" node named ``[database.][schema.]name``."""
        qualified = name
        # Prefix from the innermost qualifier outwards; empty ones are skipped.
        for qualifier in (schema, database):
            if qualifier:
                qualified = f"{qualifier}.{qualified}"
        return cls(asset_type="Table", name=qualified, **kwargs)

    @classmethod
    def column(cls, name: str, table: Optional[str] = None, **kwargs: Any) -> "LineageNode":
        """Build a "Column" node, prefixed with ``table.`` when a table is given."""
        if table:
            qualified = f"{table}.{name}"
        else:
            qualified = name
        return cls(asset_type="Column", name=qualified, **kwargs)

    @classmethod
    def pipeline(cls, name: str, **kwargs: Any) -> "LineageNode":
        """Build a "Data Pipeline" node."""
        return cls(asset_type="Data Pipeline", name=name, **kwargs)

    @classmethod
    def report(cls, name: str, **kwargs: Any) -> "LineageNode":
        """Build a "Report" node."""
        return cls(asset_type="Report", name=name, **kwargs)

    @classmethod
    def dashboard(cls, name: str, **kwargs: Any) -> "LineageNode":
        """Build a "Dashboard" node."""
        return cls(asset_type="Dashboard", name=name, **kwargs)

    def __hash__(self) -> int:
        return hash(self._internal_id)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, LineageNode):
            return False
        return self._internal_id == other._internal_id
145
+
146
+
147
@dataclass
class LineageEdge:
    """
    A directed relation between two nodes of the lineage graph.

    The edge links ``source`` to ``target`` under a named relation type,
    expressing a data flow or dependency.

    Attributes:
        source: The source node.
        target: The target node.
        relation_type: The relation type name.
        relation_type_id: Optional explicit relation type UUID.
        metadata: Optional additional metadata for the edge.
    """

    source: "LineageNode"
    target: "LineageNode"
    relation_type: str = "is source for"
    relation_type_id: Optional[str] = None
    # default_factory gives every edge its own dict instead of a shared one.
    metadata: Dict[str, Any] = field(default_factory=dict)
167
+
168
+
169
@dataclass
class LineageCommitResult:
    """
    Outcome of committing a lineage graph to Collibra.

    A fresh instance starts out successful, with zero counters and empty
    collections.

    Attributes:
        success: Whether the commit was successful.
        assets_created: Number of new assets created.
        assets_updated: Number of existing assets updated.
        relations_created: Number of relations created.
        assets: Mapping of node internal IDs to asset IDs.
        relations: IDs of the created relations.
        errors: Error messages collected during the commit.
    """

    success: bool = True

    # Counters
    assets_created: int = 0
    assets_updated: int = 0
    relations_created: int = 0

    # Collections; default_factory gives each result its own container.
    assets: Dict[str, str] = field(default_factory=dict)
    relations: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)
190
+
191
+
192
class LineageBuilder:
    """
    Fluent builder for creating data lineage in Collibra.

    The builder collects nodes and edges in memory and, on ``commit()``,
    creates the missing assets and the relations between them through the
    connector. Problems encountered during a commit are collected in the
    returned result rather than raised.

    Example:
        >>> builder = LineageBuilder(connector)
        >>>
        >>> # Define the lineage
        >>> source = LineageNode.table("raw.orders", database="s3")
        >>> transform = LineageNode.pipeline("transform_orders")
        >>> target = LineageNode.table("orders", schema="warehouse")
        >>>
        >>> builder.add_edge(source, transform, "is source for")
        >>> builder.add_edge(transform, target, "is source for")
        >>>
        >>> # Or use method chaining
        >>> builder.source(source).through(transform).to(target)
        >>>
        >>> # Commit
        >>> result = builder.commit(domain_id="lineage-domain-uuid")

    Advanced Example with multiple paths:
        >>> # Multiple sources to one target
        >>> builder.add_edges([
        ...     (table_a, etl_job, "is source for"),
        ...     (table_b, etl_job, "is source for"),
        ...     (table_c, etl_job, "is source for"),
        ...     (etl_job, output_table, "is source for"),
        ... ])
    """

    def __init__(
        self,
        connector: "CollibraConnector",
        default_relation_type: str = "is source for"
    ) -> None:
        """
        Initialize the LineageBuilder.

        Args:
            connector: The CollibraConnector instance.
            default_relation_type: Relation type used for edges that do not
                name one explicitly.
        """
        self.connector = connector
        self.default_relation_type = default_relation_type
        # Nodes keyed by their internal ID, in insertion order.
        self._nodes: Dict[str, "LineageNode"] = {}
        self._edges: List["LineageEdge"] = []
        # Cursor used by the source()/through()/to() fluent calls.
        self._current_source: Optional["LineageNode"] = None
        # Name -> ID caches so each type is looked up at most once on success.
        self._type_id_cache: Dict[str, str] = {}
        self._relation_type_cache: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Graph construction
    # ------------------------------------------------------------------

    def add_node(self, node: "LineageNode") -> "LineageBuilder":
        """
        Add a node to the lineage graph.

        Adding the same node again simply replaces the stored entry.

        Args:
            node: The LineageNode to add.

        Returns:
            Self for method chaining.
        """
        self._nodes[node._internal_id] = node
        return self

    def add_edge(
        self,
        source: "LineageNode",
        target: "LineageNode",
        relation_type: Optional[str] = None,
        relation_type_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> "LineageBuilder":
        """
        Add an edge (relation) between two nodes.

        Both nodes are registered in the graph automatically.

        Args:
            source: The source node.
            target: The target node.
            relation_type: Relation type name (uses default if not specified).
            relation_type_id: Optional specific relation type UUID.
            metadata: Optional metadata stored on the edge; it is copied.

        Returns:
            Self for method chaining.
        """
        self.add_node(source)
        self.add_node(target)

        self._edges.append(
            LineageEdge(
                source=source,
                target=target,
                relation_type=relation_type or self.default_relation_type,
                relation_type_id=relation_type_id,
                metadata=dict(metadata) if metadata else {},
            )
        )
        return self

    def add_edges(
        self,
        edges: List[Tuple["LineageNode", "LineageNode", str]]
    ) -> "LineageBuilder":
        """
        Add multiple edges at once.

        Args:
            edges: List of (source, target, relation_type) tuples.

        Returns:
            Self for method chaining.
        """
        for source, target, relation_type in edges:
            self.add_edge(source, target, relation_type)
        return self

    def source(self, node: "LineageNode") -> "LineageBuilder":
        """
        Set the current source node for the fluent API.

        Args:
            node: The source node.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.source(table_a).to(table_b)
        """
        self.add_node(node)
        self._current_source = node
        return self

    def through(
        self,
        node: "LineageNode",
        relation_type: Optional[str] = None
    ) -> "LineageBuilder":
        """
        Add an intermediate node (like an ETL job).

        An edge is created from the current source to ``node``, and ``node``
        then becomes the current source for the next fluent call.

        Args:
            node: The intermediate node.
            relation_type: Relation type from source to this node.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If ``source()`` has not been called first.

        Example:
            >>> builder.source(raw_table).through(etl_job).to(warehouse_table)
        """
        if self._current_source is None:
            raise ValueError("Must call source() before through()")

        self.add_edge(self._current_source, node, relation_type)
        self._current_source = node
        return self

    def to(
        self,
        node: "LineageNode",
        relation_type: Optional[str] = None
    ) -> "LineageBuilder":
        """
        Add the target node and create an edge from the current source.

        The current source is left unchanged, so repeated ``to()`` calls all
        start from the same node.

        Args:
            node: The target node.
            relation_type: Relation type to target.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If ``source()`` has not been called first.

        Example:
            >>> builder.source(table_a).to(table_b)
        """
        if self._current_source is None:
            raise ValueError("Must call source() before to()")

        self.add_edge(self._current_source, node, relation_type)
        return self

    def chain(
        self,
        *nodes: "LineageNode",
        relation_type: Optional[str] = None
    ) -> "LineageBuilder":
        """
        Create a chain of nodes with edges between consecutive pairs.

        Fewer than two nodes produce no edges.

        Args:
            *nodes: Nodes to chain together.
            relation_type: Relation type for all edges.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.chain(source, transform1, transform2, target)
        """
        for upstream, downstream in zip(nodes, nodes[1:]):
            self.add_edge(upstream, downstream, relation_type)
        return self

    def fan_in(
        self,
        sources: List["LineageNode"],
        target: "LineageNode",
        relation_type: Optional[str] = None
    ) -> "LineageBuilder":
        """
        Multiple sources feeding into one target.

        Args:
            sources: List of source nodes.
            target: The target node.
            relation_type: Relation type for all edges.

        Returns:
            Self for method chaining.

        Example:
            >>> # Multiple tables feeding into one ETL job
            >>> builder.fan_in([table_a, table_b, table_c], etl_job)
        """
        for source in sources:
            self.add_edge(source, target, relation_type)
        return self

    def fan_out(
        self,
        source: "LineageNode",
        targets: List["LineageNode"],
        relation_type: Optional[str] = None
    ) -> "LineageBuilder":
        """
        One source feeding into multiple targets.

        Args:
            source: The source node.
            targets: List of target nodes.
            relation_type: Relation type for all edges.

        Returns:
            Self for method chaining.

        Example:
            >>> # One ETL job producing multiple tables
            >>> builder.fan_out(etl_job, [table_a, table_b, table_c])
        """
        for target in targets:
            self.add_edge(source, target, relation_type)
        return self

    # ------------------------------------------------------------------
    # Type resolution
    # ------------------------------------------------------------------

    @staticmethod
    def _logger() -> Any:
        """Return this module's logger."""
        # Imported locally so the class does not depend on a module-level import.
        import logging

        return logging.getLogger(__name__)

    def _cached_type_lookup(
        self,
        cache: Dict[str, str],
        key: str,
        fetch: Any,
        kind: str
    ) -> Optional[str]:
        """
        Return the ID of the first result of ``fetch()``, caching successes.

        A failing lookup is logged with its traceback and reported as ``None``
        so the caller can record a "not found" error; the exception itself is
        not propagated. Misses are never cached, so a later call retries.
        """
        if key in cache:
            return cache[key]

        try:
            matches = fetch().get("results", [])
            type_id = matches[0].get("id") if matches else None
        except Exception:
            self._logger().warning(
                "Lookup of %s %r failed", kind, key, exc_info=True
            )
            return None

        if type_id:
            cache[key] = type_id
        return type_id

    def _resolve_asset_type_id(self, type_name: str) -> Optional[str]:
        """Get asset type ID by name, with caching; ``None`` if unresolved."""
        return self._cached_type_lookup(
            self._type_id_cache,
            type_name,
            lambda: self.connector.metadata.get_asset_types(name=type_name, limit=1),
            "asset type",
        )

    def _resolve_relation_type_id(self, role: str) -> Optional[str]:
        """Get relation type ID by role name, with caching; ``None`` if unresolved."""
        return self._cached_type_lookup(
            self._relation_type_cache,
            role,
            lambda: self.connector.metadata.get_relation_types(role=role, limit=1),
            "relation type",
        )

    # ------------------------------------------------------------------
    # Commit
    # ------------------------------------------------------------------

    def commit(
        self,
        domain_id: str,
        status_id: Optional[str] = None,
        dry_run: bool = False,
        create_missing_types: bool = False
    ) -> "LineageCommitResult":
        """
        Commit the lineage graph to Collibra.

        This method:
        1. Creates any new assets defined in nodes (nodes that already carry
           an ``asset_id`` are only mapped, not created)
        2. Sets description and custom attributes on new assets (best effort)
        3. Creates relations between assets

        Failures do not raise; they are collected in ``result.errors`` and
        make ``result.success`` False. Work that succeeded is not rolled back.

        Args:
            domain_id: The domain to create assets in.
            status_id: Optional status for new assets.
            dry_run: If True, nothing is sent to Collibra; the result only
                reports how many assets and relations would be attempted.
            create_missing_types: If True, log a warning for every asset or
                relation type that cannot be resolved. Missing types are
                never created.

        Returns:
            LineageCommitResult with success status and created IDs.

        Example:
            >>> result = builder.commit(domain_id="lineage-domain")
            >>> if result.success:
            ...     print(f"Created {result.assets_created} assets")
            ...     print(f"Created {result.relations_created} relations")
            ... else:
            ...     for error in result.errors:
            ...         print(f"Error: {error}")
        """
        result = LineageCommitResult()

        if dry_run:
            result.assets_created = sum(
                1 for node in self._nodes.values() if not node.asset_id
            )
            result.relations_created = len(self._edges)
            return result

        self._commit_assets(result, domain_id, status_id, create_missing_types)
        self._commit_relations(result, create_missing_types)

        # Every failure above leaves a message, so errors alone decide success.
        result.success = not result.errors
        return result

    def _commit_assets(
        self,
        result: "LineageCommitResult",
        domain_id: str,
        status_id: Optional[str],
        warn_missing_types: bool
    ) -> None:
        """Phase 1 of commit: map existing assets and create the new ones."""
        for internal_id, node in self._nodes.items():
            try:
                if node.asset_id:
                    # Existing asset - just map the ID.
                    result.assets[internal_id] = node.asset_id
                    continue

                type_id = self._resolve_asset_type_id(node.asset_type)
                if not type_id:
                    message = (
                        f"Asset type not found: {node.asset_type} for node {node.name}"
                    )
                    result.errors.append(message)
                    if warn_missing_types:
                        self._logger().warning(message)
                    continue

                asset_data: Dict[str, Any] = {
                    "name": node.name,
                    "domain_id": domain_id,
                    "type_id": type_id,
                }
                if node.display_name:
                    asset_data["display_name"] = node.display_name
                if status_id:
                    asset_data["status_id"] = status_id

                created = self.connector.asset.add_asset(**asset_data)
                asset_id = created.get("id")
                if not asset_id:
                    # Without an ID the asset cannot be used by any relation,
                    # so it must not be counted as created.
                    result.errors.append(
                        f"Failed to create asset {node.name}: no asset id returned"
                    )
                    continue

                result.assets[internal_id] = asset_id
                result.assets_created += 1

                if node.description:
                    self._set_attribute_best_effort(
                        asset_id, "Description", node.description
                    )
                for attr_name, attr_value in node.attributes.items():
                    self._set_attribute_best_effort(asset_id, attr_name, attr_value)

            except Exception as e:
                result.errors.append(f"Failed to create asset {node.name}: {str(e)}")

    def _set_attribute_best_effort(
        self,
        asset_id: str,
        type_public_id: str,
        value: Any
    ) -> None:
        """Set one attribute on an asset; a failure is logged, never raised."""
        try:
            self.connector.asset.set_asset_attributes(
                asset_id=asset_id,
                type_public_id=type_public_id,
                values=[value]
            )
        except Exception:
            # Attributes are optional: the asset exists, so the commit goes on.
            self._logger().debug(
                "Could not set attribute %r on asset %s",
                type_public_id,
                asset_id,
                exc_info=True,
            )

    def _commit_relations(
        self,
        result: "LineageCommitResult",
        warn_missing_types: bool
    ) -> None:
        """Phase 2 of commit: create a relation for every edge."""
        for edge in self._edges:
            try:
                source_id = result.assets.get(edge.source._internal_id)
                if not source_id:
                    result.errors.append(
                        f"Source asset not found for edge: {edge.source.name}"
                    )
                    continue

                target_id = result.assets.get(edge.target._internal_id)
                if not target_id:
                    result.errors.append(
                        f"Target asset not found for edge: {edge.target.name}"
                    )
                    continue

                # An explicit ID on the edge wins over lookup by role name.
                relation_type_id = (
                    edge.relation_type_id
                    or self._resolve_relation_type_id(edge.relation_type)
                )
                if not relation_type_id:
                    message = f"Relation type not found: {edge.relation_type}"
                    result.errors.append(message)
                    if warn_missing_types:
                        self._logger().warning(message)
                    continue

                created = self.connector.relation.add_relation(
                    source_id=source_id,
                    target_id=target_id,
                    type_id=relation_type_id
                )
                result.relations.append(created.get("id"))
                result.relations_created += 1

            except Exception as e:
                result.errors.append(
                    f"Failed to create relation {edge.source.name} -> {edge.target.name}: {str(e)}"
                )

    # ------------------------------------------------------------------
    # Inspection and (de)serialization
    # ------------------------------------------------------------------

    def clear(self) -> "LineageBuilder":
        """Clear all nodes and edges and reset the fluent cursor.

        The type ID caches are kept.
        """
        self._nodes.clear()
        self._edges.clear()
        self._current_source = None
        return self

    def get_nodes(self) -> List["LineageNode"]:
        """Get all nodes in the graph (a new list)."""
        return list(self._nodes.values())

    def get_edges(self) -> List["LineageEdge"]:
        """Get all edges in the graph (a new list)."""
        return list(self._edges)

    def to_dict(self) -> Dict[str, Any]:
        """
        Export the lineage graph as a dictionary.

        Every field that ``commit()`` reads is exported, so a graph restored
        with ``from_dict()`` commits the same way as the original. Attribute
        and metadata mappings are shallow copies.
        """
        return {
            "nodes": [
                {
                    "id": n._internal_id,
                    "name": n.name,
                    "asset_type": n.asset_type,
                    "asset_id": n.asset_id,
                    "display_name": n.display_name,
                    "description": n.description,
                    "attributes": dict(n.attributes or {}),
                    "metadata": dict(n.metadata or {}),
                }
                for n in self._nodes.values()
            ],
            "edges": [
                {
                    "source": e.source._internal_id,
                    "target": e.target._internal_id,
                    "relation_type": e.relation_type,
                    "relation_type_id": e.relation_type_id,
                    "metadata": dict(e.metadata or {}),
                }
                for e in self._edges
            ],
        }

    def from_dict(self, data: Dict[str, Any]) -> "LineageBuilder":
        """
        Import a lineage graph from a dictionary, replacing the current one.

        Accepts the structure produced by ``to_dict()``. Apart from a node's
        ``name`` and an edge's ``source``/``target``, every key is optional,
        so dictionaries lacking the newer keys still load. Edges that refer
        to a node ID not present in ``nodes`` are skipped.

        Args:
            data: Mapping with optional ``"nodes"`` and ``"edges"`` lists.

        Returns:
            Self for method chaining.

        Raises:
            KeyError: If a node has no ``name`` or an edge has no
                ``source``/``target``.
        """
        self.clear()

        node_map: Dict[str, "LineageNode"] = {}

        for node_data in data.get("nodes", []):
            node = LineageNode(
                name=node_data["name"],
                asset_type=node_data.get("asset_type", "Data Asset"),
                asset_id=node_data.get("asset_id"),
                display_name=node_data.get("display_name"),
                description=node_data.get("description"),
                # Copy so the graph never shares state with the input.
                attributes=dict(node_data.get("attributes") or {}),
                metadata=dict(node_data.get("metadata") or {}),
            )
            # Keep the exported ID so the edges below can find their nodes.
            node._internal_id = node_data.get("id", node._internal_id)
            node_map[node._internal_id] = node
            self.add_node(node)

        for edge_data in data.get("edges", []):
            source = node_map.get(edge_data["source"])
            target = node_map.get(edge_data["target"])
            if source is None or target is None:
                continue
            self.add_edge(
                source,
                target,
                edge_data.get("relation_type"),
                edge_data.get("relation_type_id"),
                edge_data.get("metadata"),
            )

        return self

    @staticmethod
    def _label(node: "LineageNode") -> str:
        """Text shown for a node: its name, else its asset ID, else empty."""
        return node.name or node.asset_id or ""

    def visualize(self) -> str:
        """
        Generate a simple ASCII visualization of the lineage.

        Edges are listed under their source node. Grouping uses node identity
        rather than the name, so distinct nodes that share a name get
        separate sections.

        Returns:
            ASCII representation of the lineage graph.
        """
        lines = ["Lineage Graph:", "=" * 40]

        # Group edges by source node, keeping first-seen order.
        by_source: Dict[str, List["LineageEdge"]] = {}
        for edge in self._edges:
            by_source.setdefault(edge.source._internal_id, []).append(edge)

        for edges in by_source.values():
            lines.append(f"\n[{self._label(edges[0].source)}]")
            for edge in edges:
                lines.append(
                    f" --({edge.relation_type})--> [{self._label(edge.target)}]"
                )

        return "\n".join(lines)