collibra-connector 1.0.19__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,845 @@
1
+ """
2
+ Helper utilities for the Collibra Connector.
3
+
4
+ This module provides utility classes and functions for:
5
+ - Pagination handling
6
+ - Batch operations
7
+ - Data transformations
8
+ - Caching
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ import functools
14
+ from typing import (
15
+ Any,
16
+ Callable,
17
+ Dict,
18
+ Generator,
19
+ Iterator,
20
+ List,
21
+ Optional,
22
+ TypeVar,
23
+ Union,
24
+ TYPE_CHECKING,
25
+ )
26
+ from dataclasses import dataclass, field
27
+ from datetime import datetime, timedelta
28
+ from threading import Lock
29
+
30
+ if TYPE_CHECKING:
31
+ from .connector import CollibraConnector
32
+
33
+
34
+ T = TypeVar('T')
35
+
36
+
37
@dataclass
class PaginatedResponse:
    """
    One page of results from a paginated Collibra API call.

    Attributes:
        results: Items contained in this page.
        total: Total number of matching items across all pages.
        offset: Zero-based offset of this page within the result set.
        limit: Page size that was requested.
        next_cursor: Opaque cursor for the next page when the endpoint
            uses cursor pagination, otherwise None.
    """
    results: List[Dict[str, Any]]
    total: int
    offset: int = 0
    limit: int = 0
    next_cursor: Optional[str] = None

    @classmethod
    def from_response(cls, response: Dict[str, Any]) -> "PaginatedResponse":
        """Build a page object from a raw API response dictionary."""
        get = response.get
        return cls(
            results=get("results", []),
            total=get("total", 0),
            offset=get("offset", 0),
            limit=get("limit", 0),
            next_cursor=get("nextCursor"),
        )

    def has_more(self) -> bool:
        """Return True when at least one further page is available."""
        # A cursor from the server always signals more data; otherwise
        # compare how far we have read against the reported total.
        return bool(self.next_cursor) or self.offset + len(self.results) < self.total

    def __len__(self) -> int:
        """Return the number of results in this page."""
        return len(self.results)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Yield each result dictionary in this page, in order."""
        yield from self.results
79
+
80
+
81
class Paginator:
    """
    Helper class for iterating over paginated API results.

    Provides both page-by-page and item-by-item iteration over
    large result sets without loading everything into memory.

    Example:
        >>> paginator = Paginator(connector.asset.find_assets, limit=100)
        >>> for asset in paginator.items():
        ...     print(asset['name'])

        >>> # Or iterate by pages
        >>> for page in paginator.pages():
        ...     print(f"Processing {len(page)} items")
    """

    def __init__(
        self,
        fetch_func: Callable[..., Dict[str, Any]],
        limit: int = 100,
        max_items: Optional[int] = None,
        use_cursor: bool = False,
        **kwargs: Any
    ) -> None:
        """
        Initialize the paginator.

        Args:
            fetch_func: The API method to call for fetching results.
            limit: Number of items to fetch per page.
            max_items: Maximum total items to fetch (None for all;
                0 fetches nothing).
            use_cursor: Whether to use cursor-based pagination.
            **kwargs: Additional arguments to pass to fetch_func.
        """
        self.fetch_func = fetch_func
        self.limit = limit
        self.max_items = max_items
        self.use_cursor = use_cursor
        self.kwargs = kwargs
        self._total_fetched = 0

    def pages(self) -> Generator[PaginatedResponse, None, None]:
        """
        Iterate over pages of results.

        Yields:
            PaginatedResponse objects for each page.
        """
        offset = 0
        # Cursor pagination starts with an empty cursor; offset
        # pagination leaves it as None.
        cursor: Optional[str] = "" if self.use_cursor else None

        while True:
            # Stop once the item budget is exhausted. Compare against
            # None explicitly so that max_items=0 means "fetch nothing"
            # rather than being treated like "no limit".
            if self.max_items is not None and self._total_fetched >= self.max_items:
                break

            # Shrink the final request so we never over-fetch past max_items.
            current_limit = self.limit
            if self.max_items is not None:
                remaining = self.max_items - self._total_fetched
                current_limit = min(self.limit, remaining)

            # Build request parameters
            params = {**self.kwargs, "limit": current_limit}

            if self.use_cursor and cursor is not None:
                params["cursor"] = cursor
            else:
                params["offset"] = offset

            # Fetch the page
            response = self.fetch_func(**params)
            page = PaginatedResponse.from_response(response)

            # An empty page means the server has nothing further.
            if not page.results:
                break

            self._total_fetched += len(page.results)
            yield page

            # Advance to the next page.
            if self.use_cursor:
                cursor = page.next_cursor
                if not cursor:
                    break
            else:
                offset += len(page.results)
                if not page.has_more():
                    break

    def items(self) -> Generator[Dict[str, Any], None, None]:
        """
        Iterate over individual items across all pages.

        Yields:
            Individual result dictionaries.
        """
        for page in self.pages():
            yield from page.results

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect all items into a list.

        Warning: This loads all results into memory. Use items()
        for memory-efficient iteration over large datasets.

        Returns:
            List of all result dictionaries.
        """
        return list(self.items())

    @property
    def total_fetched(self) -> int:
        """Get the total number of items fetched so far."""
        return self._total_fetched
198
+
199
+
200
class BatchProcessor:
    """
    Run an API operation over many items in rate-limited batches.

    Intended for bulk work such as creating or updating many assets
    while pausing between batches to respect API rate limits.

    Example:
        >>> processor = BatchProcessor(batch_size=50, delay=0.5)
        >>> results = processor.process(
        ...     items=assets_to_create,
        ...     operation=connector.asset.add_asset,
        ...     item_mapper=lambda a: {'name': a['name'], 'domain_id': a['domain']}
        ... )
    """

    def __init__(
        self,
        batch_size: int = 50,
        delay: float = 0.1,
        on_error: str = "continue"
    ) -> None:
        """
        Initialize the batch processor.

        Args:
            batch_size: Number of items to process per batch.
            delay: Delay in seconds between batches.
            on_error: Error handling strategy: "continue", "stop", or "collect".
        """
        self.batch_size = batch_size
        self.delay = delay
        self.on_error = on_error

    def process(
        self,
        items: List[T],
        operation: Callable[..., Any],
        item_mapper: Optional[Callable[[T], Dict[str, Any]]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> "BatchResult":
        """
        Process items one by one, sleeping after every full batch.

        Args:
            items: List of items to process.
            operation: The API operation to call for each item.
            item_mapper: Optional function to transform items into operation kwargs.
            progress_callback: Optional callback(processed, total) for progress updates.

        Returns:
            BatchResult with success/failure information.
        """
        outcome = BatchResult()
        total = len(items)

        for processed, item in enumerate(items, start=1):
            try:
                # Map the item to call arguments when a mapper was given.
                payload = item if item_mapper is None else item_mapper(item)

                # Dict payloads are expanded as keyword arguments;
                # anything else is passed positionally.
                if isinstance(payload, dict):
                    outcome.add_success(item, operation(**payload))
                else:
                    outcome.add_success(item, operation(payload))
            except Exception as exc:
                outcome.add_error(item, exc)
                # "stop" aborts on the first failure; other strategies
                # keep going and collect the error.
                if self.on_error == "stop":
                    break

            if progress_callback is not None:
                progress_callback(processed, total)

            # Pause after each completed batch, except after the last item.
            if processed % self.batch_size == 0 and processed < total:
                time.sleep(self.delay)

        return outcome
280
+
281
+
282
@dataclass
class BatchResult:
    """
    Outcome of a batch processing run.

    Attributes:
        successes: (item, response) pairs for operations that completed.
        errors: (item, exception) pairs for operations that raised.
    """
    successes: List[tuple] = field(default_factory=list)
    errors: List[tuple] = field(default_factory=list)

    def add_success(self, item: Any, response: Any) -> None:
        """Record one completed operation."""
        self.successes.append((item, response))

    def add_error(self, item: Any, error: Exception) -> None:
        """Record one failed operation."""
        self.errors.append((item, error))

    @property
    def success_count(self) -> int:
        """Number of operations that completed."""
        return len(self.successes)

    @property
    def error_count(self) -> int:
        """Number of operations that raised."""
        return len(self.errors)

    @property
    def total_count(self) -> int:
        """Number of operations attempted."""
        return len(self.successes) + len(self.errors)

    @property
    def success_rate(self) -> float:
        """Share of successful operations, as a percentage (0.0 when empty)."""
        attempted = self.total_count
        if attempted == 0:
            return 0.0
        return (self.success_count / attempted) * 100

    def __repr__(self) -> str:
        return f"BatchResult(successes={self.success_count}, errors={self.error_count})"
326
+
327
+
328
class CachedMetadata:
    """
    Thread-safe cache for Collibra metadata like UUIDs.

    Caches metadata to avoid repeated API calls for frequently
    accessed data like asset types, statuses, and attributes.

    Each category ("asset_types", "statuses", "attribute_types",
    "domain_types", "relation_types", "roles") is stored as a
    name -> UUID mapping and expires independently after ``ttl`` seconds.

    Example:
        >>> cache = CachedMetadata(connector, ttl=3600)
        >>> asset_type_id = cache.get_asset_type_id("Business Term")
        >>> status_id = cache.get_status_id("Approved")
    """

    def __init__(
        self,
        connector: "CollibraConnector",
        ttl: int = 3600
    ) -> None:
        """
        Initialize the metadata cache.

        Args:
            connector: The CollibraConnector instance.
            ttl: Time-to-live in seconds for cached data.
        """
        self.connector = connector
        self.ttl = ttl
        # category -> {name: id} lookup tables
        self._cache: Dict[str, Dict[str, Any]] = {}
        # category -> time of last refresh (naive local time)
        self._timestamps: Dict[str, datetime] = {}
        # Guards _cache/_timestamps against concurrent refreshes
        self._lock = Lock()

    def _is_expired(self, key: str) -> bool:
        """Check if a cache entry has expired (never-loaded counts as expired)."""
        if key not in self._timestamps:
            return True
        return datetime.now() - self._timestamps[key] > timedelta(seconds=self.ttl)

    def _refresh_if_needed(self, category: str) -> None:
        """Refresh cache for a category if expired.

        The lock spans both the expiry check and the refresh so only
        one thread reloads a given category at a time.
        """
        with self._lock:
            if self._is_expired(category):
                self._refresh_category(category)

    def _refresh_category(self, category: str) -> None:
        """Refresh a specific category of metadata.

        Called with ``self._lock`` held from _refresh_if_needed;
        NOTE(review): refresh_all() calls this without taking the lock —
        confirm that is intended.
        """
        try:
            if category == "asset_types":
                # Prefer a dedicated metadata client method when the
                # connector provides one; otherwise fall back to the
                # raw REST endpoint.
                data = self._fetch_all_pages(
                    self.connector.metadata.get_asset_types
                    if hasattr(self.connector.metadata, 'get_asset_types')
                    else lambda **kw: self._get_via_base_api("/assetTypes", **kw)
                )
                self._cache["asset_types"] = {item["name"]: item["id"] for item in data}

            elif category == "statuses":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/statuses", **kw)
                )
                self._cache["statuses"] = {item["name"]: item["id"] for item in data}

            elif category == "attribute_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/attributeTypes", **kw)
                )
                self._cache["attribute_types"] = {item["name"]: item["id"] for item in data}

            elif category == "domain_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/domainTypes", **kw)
                )
                self._cache["domain_types"] = {item["name"]: item["id"] for item in data}

            elif category == "relation_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/relationTypes", **kw)
                )
                # Relation types are keyed by "<source>_<target>" type names.
                # NOTE(review): assumes every item carries nested
                # sourceType/targetType objects with a "name" — a missing
                # key would raise and fall into the except branch below.
                self._cache["relation_types"] = {
                    f"{item['sourceType']['name']}_{item['targetType']['name']}": item["id"]
                    for item in data
                }

            elif category == "roles":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/roles", **kw)
                )
                self._cache["roles"] = {item["name"]: item["id"] for item in data}

            self._timestamps[category] = datetime.now()

        except Exception:
            # On error, set empty cache to avoid repeated failures:
            # lookups return None until the TTL elapses and a refresh
            # is attempted again.
            self._cache[category] = {}
            self._timestamps[category] = datetime.now()

    def _get_via_base_api(self, endpoint: str, **kwargs: Any) -> Dict[str, Any]:
        """Make a GET request via the base API.

        Assumes the connector exposes ``api`` (base URL), ``auth``, and
        ``timeout`` attributes — TODO confirm against CollibraConnector.
        """
        import requests
        url = f"{self.connector.api}{endpoint}"
        response = requests.get(
            url,
            auth=self.connector.auth,
            params=kwargs,
            timeout=self.connector.timeout
        )
        response.raise_for_status()
        return response.json()

    def _fetch_all_pages(self, fetch_func: Callable[..., Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fetch all pages of a paginated endpoint.

        Uses offset pagination and stops when a page comes back shorter
        than the requested limit.
        """
        all_results = []
        offset = 0
        limit = 1000

        while True:
            response = fetch_func(offset=offset, limit=limit)
            results = response.get("results", [])
            all_results.extend(results)

            # A short page means the server is out of data.
            if len(results) < limit:
                break
            offset += limit

        return all_results

    def get_asset_type_id(self, name: str) -> Optional[str]:
        """Get asset type UUID by name (None when unknown)."""
        self._refresh_if_needed("asset_types")
        return self._cache.get("asset_types", {}).get(name)

    def get_status_id(self, name: str) -> Optional[str]:
        """Get status UUID by name (None when unknown)."""
        self._refresh_if_needed("statuses")
        return self._cache.get("statuses", {}).get(name)

    def get_attribute_type_id(self, name: str) -> Optional[str]:
        """Get attribute type UUID by name (None when unknown)."""
        self._refresh_if_needed("attribute_types")
        return self._cache.get("attribute_types", {}).get(name)

    def get_domain_type_id(self, name: str) -> Optional[str]:
        """Get domain type UUID by name (None when unknown)."""
        self._refresh_if_needed("domain_types")
        return self._cache.get("domain_types", {}).get(name)

    def get_role_id(self, name: str) -> Optional[str]:
        """Get role UUID by name (None when unknown)."""
        self._refresh_if_needed("roles")
        return self._cache.get("roles", {}).get(name)

    def clear(self) -> None:
        """Clear all cached data."""
        with self._lock:
            self._cache.clear()
            self._timestamps.clear()

    def refresh_all(self) -> None:
        """Force refresh of all cached data.

        NOTE(review): refreshes each category without holding the lock,
        unlike _refresh_if_needed — confirm whether concurrent access
        during a full refresh matters here.
        """
        categories = ["asset_types", "statuses", "attribute_types",
                      "domain_types", "relation_types", "roles"]
        for category in categories:
            self._refresh_category(category)
489
+
490
+
491
def timed_cache(ttl_seconds: int = 300) -> Callable:
    """
    Decorator for caching function results with TTL.

    Results are keyed on the stringified positional and keyword
    arguments, so arguments must have stable reprs. Entries are
    recomputed (and overwritten) once they are older than the TTL;
    stale entries are not proactively evicted.

    Args:
        ttl_seconds: Time-to-live in seconds for cached results.

    Example:
        >>> @timed_cache(ttl_seconds=60)
        ... def expensive_operation():
        ...     return fetch_data()
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        cache: Dict[str, tuple] = {}
        lock = Lock()

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            # Create cache key from args
            key = str((args, sorted(kwargs.items())))

            # Only hold the lock for the cache lookup.
            with lock:
                entry = cache.get(key)
                if entry is not None:
                    result, timestamp = entry
                    if datetime.now() - timestamp < timedelta(seconds=ttl_seconds):
                        return result

            # Compute outside the lock so one slow call does not block
            # cached hits for unrelated keys. Concurrent misses on the
            # same key may each compute once; the last writer wins.
            result = func(*args, **kwargs)
            with lock:
                cache[key] = (result, datetime.now())
            return result

        # Add method to clear cache
        def clear_cache() -> None:
            """Remove all cached entries."""
            with lock:
                cache.clear()

        wrapper.clear_cache = clear_cache  # type: ignore
        return wrapper

    return decorator
532
+
533
+
534
class DataFrameExporter:
    """
    Utility class for exporting Collibra data to pandas DataFrames.

    Provides methods for converting API responses and asset profiles
    to pandas DataFrames for analysis and export.

    Note: Requires pandas to be installed (`pip install pandas`).

    Example:
        >>> from collibra_connector import CollibraConnector, DataFrameExporter
        >>> connector = CollibraConnector(...)
        >>> exporter = DataFrameExporter(connector)
        >>> df = exporter.assets_to_dataframe(domain_id="domain-uuid")
        >>> df.to_csv("assets.csv")
    """

    def __init__(self, connector: "CollibraConnector") -> None:
        """
        Initialize the DataFrame exporter.

        Args:
            connector: The CollibraConnector instance.
        """
        self.connector = connector
        # Lazily-imported pandas module; populated on first use.
        self._pandas = None

    def _get_pandas(self) -> Any:
        """Lazy load pandas to avoid import errors if not installed."""
        if self._pandas is None:
            try:
                import pandas as pd
                self._pandas = pd
            except ImportError:
                raise ImportError(
                    "pandas is required for DataFrame export. "
                    "Install it with: pip install pandas"
                )
        return self._pandas

    def assets_to_dataframe(
        self,
        domain_id: Optional[str] = None,
        community_id: Optional[str] = None,
        asset_type_ids: Optional[List[str]] = None,
        limit: int = 1000,
        include_attributes: bool = True,
        include_relations: bool = False
    ) -> Any:
        """
        Export assets to a pandas DataFrame.

        Builds one summary row per asset; nested type/status/domain
        objects are reduced to their names. With include_attributes or
        include_relations, one extra API call is made per asset (N+1
        pattern), so large exports can be slow.

        Args:
            domain_id: Filter by domain ID.
            community_id: Filter by community ID.
            asset_type_ids: Filter by asset type IDs.
            limit: Maximum number of assets to fetch.
            include_attributes: Include asset attributes as columns.
            include_relations: Include relation summaries (slower).

        Returns:
            pandas DataFrame with asset data.

        Example:
            >>> df = exporter.assets_to_dataframe(domain_id="uuid", limit=500)
            >>> print(df.columns)
            >>> df.to_excel("assets.xlsx")
        """
        pd = self._get_pandas()

        # Fetch assets
        assets_result = self.connector.asset.find_assets(
            domain_id=domain_id,
            community_id=community_id,
            asset_type_ids=asset_type_ids,
            limit=limit
        )

        records = []
        for asset in assets_result.get("results", []):
            record = {
                "id": asset.get("id"),
                "name": asset.get("name"),
                "display_name": asset.get("displayName"),
                "type": asset.get("type", {}).get("name"),
                "status": asset.get("status", {}).get("name"),
                "domain": asset.get("domain", {}).get("name"),
                "created_on": asset.get("createdOn"),
                "last_modified_on": asset.get("lastModifiedOn"),
            }

            # Add attributes
            if include_attributes:
                try:
                    attrs = self.connector.attribute.get_attributes_as_dict(asset["id"])
                    for attr_name, attr_value in attrs.items():
                        # Attribute columns are prefixed and snake_cased,
                        # e.g. "Definition" -> "attr_definition".
                        col_name = f"attr_{attr_name.lower().replace(' ', '_')}"
                        # Clean HTML: strip tags from rich-text values.
                        if isinstance(attr_value, str) and '<' in attr_value:
                            import re
                            attr_value = re.sub(r'<[^>]+>', '', attr_value)
                        record[col_name] = attr_value
                except Exception:
                    # Best-effort: an asset whose attributes fail to load
                    # still gets its summary row.
                    pass

            # Add relations summary
            if include_relations:
                try:
                    # NOTE(review): assumes the relations response carries
                    # "outgoing_count"/"incoming_count" keys — confirm
                    # against connector.relation.get_asset_relations.
                    relations = self.connector.relation.get_asset_relations(
                        asset["id"],
                        include_type_details=True
                    )
                    record["relations_outgoing"] = relations.get("outgoing_count", 0)
                    record["relations_incoming"] = relations.get("incoming_count", 0)
                except Exception:
                    pass

            records.append(record)

        return pd.DataFrame(records)

    def profiles_to_dataframe(
        self,
        asset_ids: List[str],
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> Any:
        """
        Export multiple asset profiles to a pandas DataFrame.

        Uses get_full_profile_flat() to get comprehensive data for each asset.

        Args:
            asset_ids: List of asset UUIDs to export.
            progress_callback: Optional callback(current, total) for progress updates.

        Returns:
            pandas DataFrame with flattened profile data. Assets whose
            profile fetch fails contribute a row with only "id" and
            "error" columns.

        Example:
            >>> asset_ids = ["uuid1", "uuid2", "uuid3"]
            >>> df = exporter.profiles_to_dataframe(asset_ids)
            >>> df.to_csv("profiles.csv")
        """
        pd = self._get_pandas()

        records = []
        total = len(asset_ids)

        for i, asset_id in enumerate(asset_ids):
            try:
                flat_profile = self.connector.asset.get_full_profile_flat(asset_id)
                records.append(flat_profile)
            except Exception as e:
                # Include partial record with error
                records.append({
                    "id": asset_id,
                    "error": str(e)
                })

            # Progress is reported even for failed assets.
            if progress_callback:
                progress_callback(i + 1, total)

        return pd.DataFrame(records)

    def communities_to_dataframe(self, limit: int = 1000) -> Any:
        """
        Export communities to a pandas DataFrame.

        Args:
            limit: Maximum number of communities to fetch.

        Returns:
            pandas DataFrame with community data (parent columns are
            None for top-level communities).
        """
        pd = self._get_pandas()

        result = self.connector.community.find_communities(limit=limit)

        records = []
        for comm in result.get("results", []):
            records.append({
                "id": comm.get("id"),
                "name": comm.get("name"),
                "description": comm.get("description"),
                "parent_id": comm.get("parent", {}).get("id") if comm.get("parent") else None,
                "parent_name": comm.get("parent", {}).get("name") if comm.get("parent") else None,
                "created_on": comm.get("createdOn"),
            })

        return pd.DataFrame(records)

    def domains_to_dataframe(
        self,
        community_id: Optional[str] = None,
        limit: int = 1000
    ) -> Any:
        """
        Export domains to a pandas DataFrame.

        Args:
            community_id: Filter by community ID.
            limit: Maximum number of domains to fetch.

        Returns:
            pandas DataFrame with domain data.
        """
        pd = self._get_pandas()

        result = self.connector.domain.find_domains(
            community_id=community_id,
            limit=limit
        )

        records = []
        for domain in result.get("results", []):
            records.append({
                "id": domain.get("id"),
                "name": domain.get("name"),
                "description": domain.get("description"),
                "type": domain.get("type", {}).get("name"),
                "community_id": domain.get("community", {}).get("id"),
                "community_name": domain.get("community", {}).get("name"),
                "created_on": domain.get("createdOn"),
            })

        return pd.DataFrame(records)
760
+
761
+
762
class DataTransformer:
    """
    Static helpers for reshaping Collibra data structures.

    Covers flattening nested responses, extracting fields, grouping,
    and building name -> id lookup tables.
    """

    @staticmethod
    def flatten_asset(asset: Dict[str, Any]) -> Dict[str, Any]:
        """
        Flatten a nested asset response into a flat dictionary.

        Nested dicts produce dot-notation keys ("a.b"); list elements
        produce bracketed indices ("a[0]").

        Args:
            asset: The asset dictionary from the API.

        Returns:
            Flattened dictionary with dot-notation keys.
        """
        flattened: Dict[str, Any] = {}

        def _walk(node: Any, path: str = "") -> None:
            # Lists recurse with an index suffix.
            if isinstance(node, list):
                for idx, element in enumerate(node):
                    _walk(element, f"{path}[{idx}]")
                return
            # Scalars (and None) are recorded at the current path.
            if not isinstance(node, dict):
                flattened[path] = node
                return
            # Dicts extend the path with each key.
            for key, value in node.items():
                _walk(value, f"{path}.{key}" if path else key)

        _walk(asset)
        return flattened

    @staticmethod
    def extract_ids(items: List[Dict[str, Any]], key: str = "id") -> List[str]:
        """
        Extract a specific field from a list of dictionaries.

        Missing or falsy values (None, "", 0) are skipped.

        Args:
            items: List of dictionaries.
            key: The key to extract.

        Returns:
            List of extracted values.
        """
        extracted = []
        for entry in items:
            value = entry.get(key)
            if value:
                extracted.append(value)
        return extracted

    @staticmethod
    def group_by(items: List[Dict[str, Any]], key: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group items by a specific key value.

        Items missing the key are grouped under "unknown".

        Args:
            items: List of dictionaries.
            key: The key to group by.

        Returns:
            Dictionary mapping key values to lists of items.
        """
        grouped: Dict[str, List[Dict[str, Any]]] = {}
        for entry in items:
            grouped.setdefault(entry.get(key, "unknown"), []).append(entry)
        return grouped

    @staticmethod
    def to_name_id_map(items: List[Dict[str, Any]]) -> Dict[str, str]:
        """
        Convert a list of items to a name->id mapping.

        Items lacking either key are skipped; later duplicates of a
        name overwrite earlier ones.

        Args:
            items: List of dictionaries with 'name' and 'id' keys.

        Returns:
            Dictionary mapping names to IDs.
        """
        mapping: Dict[str, str] = {}
        for entry in items:
            if "name" in entry and "id" in entry:
                mapping[entry["name"]] = entry["id"]
        return mapping