collibra-connector 1.0.19__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- collibra_connector/__init__.py +284 -4
- collibra_connector/api/Asset.py +301 -3
- collibra_connector/api/Attribute.py +204 -0
- collibra_connector/api/Base.py +2 -2
- collibra_connector/api/Relation.py +216 -0
- collibra_connector/api/Responsibility.py +5 -5
- collibra_connector/api/Search.py +102 -0
- collibra_connector/api/Workflow.py +50 -16
- collibra_connector/api/__init__.py +23 -13
- collibra_connector/async_connector.py +930 -0
- collibra_connector/cli.py +597 -0
- collibra_connector/connector.py +270 -48
- collibra_connector/helpers.py +845 -0
- collibra_connector/lineage.py +716 -0
- collibra_connector/models.py +897 -0
- collibra_connector/py.typed +0 -0
- collibra_connector/telemetry.py +576 -0
- collibra_connector/testing.py +806 -0
- collibra_connector-1.1.1.dist-info/METADATA +540 -0
- collibra_connector-1.1.1.dist-info/RECORD +32 -0
- {collibra_connector-1.0.19.dist-info → collibra_connector-1.1.1.dist-info}/WHEEL +1 -1
- collibra_connector-1.1.1.dist-info/entry_points.txt +2 -0
- collibra_connector-1.0.19.dist-info/METADATA +0 -157
- collibra_connector-1.0.19.dist-info/RECORD +0 -21
- {collibra_connector-1.0.19.dist-info → collibra_connector-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {collibra_connector-1.0.19.dist-info → collibra_connector-1.1.1.dist-info}/top_level.txt +0 -0

collibra_connector/helpers.py (new file)
@@ -0,0 +1,845 @@
"""
Helper utilities for the Collibra Connector.

This module provides utility classes and functions for:
- Pagination handling
- Batch operations
- Data transformations
- Caching
"""
from __future__ import annotations

import functools
import re
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from threading import Lock
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    Iterator,
    List,
    Optional,
    TypeVar,
)

if TYPE_CHECKING:
    from .connector import CollibraConnector


T = TypeVar('T')


@dataclass
class PaginatedResponse:
    """
    Represents a paginated API response.

    Attributes:
        results: List of items in the current page.
        total: Total number of items available.
        offset: Current offset in the result set.
        limit: Number of items per page.
        next_cursor: Cursor for the next page (if using cursor pagination).
    """
    results: List[Dict[str, Any]]
    total: int
    offset: int = 0
    limit: int = 0
    next_cursor: Optional[str] = None

    @classmethod
    def from_response(cls, response: Dict[str, Any]) -> "PaginatedResponse":
        """Create a PaginatedResponse from an API response dict."""
        return cls(
            results=response.get("results", []),
            total=response.get("total", 0),
            offset=response.get("offset", 0),
            limit=response.get("limit", 0),
            next_cursor=response.get("nextCursor"),
        )

    def has_more(self) -> bool:
        """Check if there are more pages available."""
        if self.next_cursor:
            return True
        return self.offset + len(self.results) < self.total

    def __len__(self) -> int:
        """Return the number of results in this page."""
        return len(self.results)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate over results in this page."""
        return iter(self.results)
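
Since everything in this hunk is new code, a quick illustration may help. This is a minimal usage sketch (not part of helpers.py) of PaginatedResponse against a hand-built response dict; the payload below is invented for illustration, not a real Collibra response:

# Usage sketch (not part of helpers.py); the payload below is made up.
raw = {
    "results": [{"id": "a1", "name": "Customer"}, {"id": "a2", "name": "Order"}],
    "total": 5,
    "offset": 0,
    "limit": 2,
}
page = PaginatedResponse.from_response(raw)
assert len(page) == 2
assert page.has_more()          # 0 + 2 < 5, so more pages exist
for item in page:               # __iter__ walks the current page only
    print(item["name"])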


class Paginator:
    """
    Helper class for iterating over paginated API results.

    Provides both page-by-page and item-by-item iteration over
    large result sets without loading everything into memory.

    Example:
        >>> paginator = Paginator(connector.asset.find_assets, limit=100)
        >>> for asset in paginator.items():
        ...     print(asset['name'])

        >>> # Or iterate by pages
        >>> for page in paginator.pages():
        ...     print(f"Processing {len(page)} items")
    """

    def __init__(
        self,
        fetch_func: Callable[..., Dict[str, Any]],
        limit: int = 100,
        max_items: Optional[int] = None,
        use_cursor: bool = False,
        **kwargs: Any
    ) -> None:
        """
        Initialize the paginator.

        Args:
            fetch_func: The API method to call for fetching results.
            limit: Number of items to fetch per page.
            max_items: Maximum total items to fetch (None for all).
            use_cursor: Whether to use cursor-based pagination.
            **kwargs: Additional arguments to pass to fetch_func.
        """
        self.fetch_func = fetch_func
        self.limit = limit
        self.max_items = max_items
        self.use_cursor = use_cursor
        self.kwargs = kwargs
        self._total_fetched = 0

    def pages(self) -> Generator[PaginatedResponse, None, None]:
        """
        Iterate over pages of results.

        Yields:
            PaginatedResponse objects for each page.
        """
        offset = 0
        cursor: Optional[str] = None

        while True:
            # Check if we've reached max_items
            if self.max_items and self._total_fetched >= self.max_items:
                break

            # Calculate limit for this request
            current_limit = self.limit
            if self.max_items:
                remaining = self.max_items - self._total_fetched
                current_limit = min(self.limit, remaining)

            # Build request parameters
            params = {**self.kwargs, "limit": current_limit}

            if self.use_cursor:
                # Only send a cursor once the API has returned one;
                # the first request goes out without a cursor parameter.
                if cursor:
                    params["cursor"] = cursor
            else:
                params["offset"] = offset

            # Fetch the page
            response = self.fetch_func(**params)
            page = PaginatedResponse.from_response(response)

            if not page.results:
                break

            self._total_fetched += len(page.results)
            yield page

            # Prepare for next iteration
            if self.use_cursor:
                cursor = page.next_cursor
                if not cursor:
                    break
            else:
                offset += len(page.results)
                if not page.has_more():
                    break

    def items(self) -> Generator[Dict[str, Any], None, None]:
        """
        Iterate over individual items across all pages.

        Yields:
            Individual result dictionaries.
        """
        for page in self.pages():
            yield from page.results

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect all items into a list.

        Warning: This loads all results into memory. Use items()
        for memory-efficient iteration over large datasets.

        Returns:
            List of all result dictionaries.
        """
        return list(self.items())

    @property
    def total_fetched(self) -> int:
        """Get the total number of items fetched so far."""
        return self._total_fetched
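
To see the offset arithmetic in isolation, here is a self-contained sketch (not part of helpers.py) that drives Paginator with a stubbed fetch function instead of a live connector; DATA and fake_find_assets are invented for illustration:

# Usage sketch (not part of helpers.py): a fake endpoint over an in-memory list.
DATA = [{"id": str(i), "name": f"asset-{i}"} for i in range(25)]

def fake_find_assets(offset: int = 0, limit: int = 10, **_: object) -> dict:
    return {
        "results": DATA[offset:offset + limit],
        "total": len(DATA),
        "offset": offset,
        "limit": limit,
    }

paginator = Paginator(fake_find_assets, limit=10, max_items=18)
names = [item["name"] for item in paginator.items()]
assert len(names) == 18                  # max_items trims the second page to 8
assert paginator.total_fetched == 18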


class BatchProcessor:
    """
    Helper class for processing items in batches.

    Useful for bulk operations like creating or updating multiple
    assets, while respecting API rate limits.

    Example:
        >>> processor = BatchProcessor(batch_size=50, delay=0.5)
        >>> results = processor.process(
        ...     items=assets_to_create,
        ...     operation=connector.asset.add_asset,
        ...     item_mapper=lambda a: {'name': a['name'], 'domain_id': a['domain']}
        ... )
    """

    def __init__(
        self,
        batch_size: int = 50,
        delay: float = 0.1,
        on_error: str = "continue"
    ) -> None:
        """
        Initialize the batch processor.

        Args:
            batch_size: Number of items to process per batch.
            delay: Delay in seconds between batches.
            on_error: Error handling strategy: "continue", "stop", or "collect".
        """
        self.batch_size = batch_size
        self.delay = delay
        self.on_error = on_error

    def process(
        self,
        items: List[T],
        operation: Callable[..., Any],
        item_mapper: Optional[Callable[[T], Dict[str, Any]]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> "BatchResult":
        """
        Process items in batches.

        Args:
            items: List of items to process.
            operation: The API operation to call for each item.
            item_mapper: Optional function to transform items into operation kwargs.
            progress_callback: Optional callback(processed, total) for progress updates.

        Returns:
            BatchResult with success/failure information.
        """
        result = BatchResult()
        total = len(items)

        for i, item in enumerate(items):
            try:
                # Transform item if mapper provided
                kwargs = item_mapper(item) if item_mapper else item

                # Execute operation
                response = operation(**kwargs) if isinstance(kwargs, dict) else operation(kwargs)
                result.add_success(item, response)

            except Exception as e:
                result.add_error(item, e)

                if self.on_error == "stop":
                    break

            # Progress callback
            if progress_callback:
                progress_callback(i + 1, total)

            # Delay between batches
            if (i + 1) % self.batch_size == 0 and i + 1 < total:
                time.sleep(self.delay)

        return result


@dataclass
class BatchResult:
    """
    Result of a batch processing operation.

    Attributes:
        successes: List of (item, response) tuples for successful operations.
        errors: List of (item, exception) tuples for failed operations.
    """
    successes: List[tuple] = field(default_factory=list)
    errors: List[tuple] = field(default_factory=list)

    def add_success(self, item: Any, response: Any) -> None:
        """Add a successful result."""
        self.successes.append((item, response))

    def add_error(self, item: Any, error: Exception) -> None:
        """Add an error result."""
        self.errors.append((item, error))

    @property
    def success_count(self) -> int:
        """Get the number of successful operations."""
        return len(self.successes)

    @property
    def error_count(self) -> int:
        """Get the number of failed operations."""
        return len(self.errors)

    @property
    def total_count(self) -> int:
        """Get the total number of operations."""
        return self.success_count + self.error_count

    @property
    def success_rate(self) -> float:
        """Get the success rate as a percentage."""
        if self.total_count == 0:
            return 0.0
        return (self.success_count / self.total_count) * 100

    def __repr__(self) -> str:
        return f"BatchResult(successes={self.success_count}, errors={self.error_count})"


class CachedMetadata:
    """
    Thread-safe cache for Collibra metadata like UUIDs.

    Caches metadata to avoid repeated API calls for frequently
    accessed data like asset types, statuses, and attributes.

    Example:
        >>> cache = CachedMetadata(connector, ttl=3600)
        >>> asset_type_id = cache.get_asset_type_id("Business Term")
        >>> status_id = cache.get_status_id("Approved")
    """

    def __init__(
        self,
        connector: "CollibraConnector",
        ttl: int = 3600
    ) -> None:
        """
        Initialize the metadata cache.

        Args:
            connector: The CollibraConnector instance.
            ttl: Time-to-live in seconds for cached data.
        """
        self.connector = connector
        self.ttl = ttl
        self._cache: Dict[str, Dict[str, Any]] = {}
        self._timestamps: Dict[str, datetime] = {}
        self._lock = Lock()

    def _is_expired(self, key: str) -> bool:
        """Check if a cache entry has expired."""
        if key not in self._timestamps:
            return True
        return datetime.now() - self._timestamps[key] > timedelta(seconds=self.ttl)

    def _refresh_if_needed(self, category: str) -> None:
        """Refresh cache for a category if expired."""
        with self._lock:
            if self._is_expired(category):
                self._refresh_category(category)

    def _refresh_category(self, category: str) -> None:
        """Refresh a specific category of metadata. Callers must hold the lock."""
        try:
            if category == "asset_types":
                data = self._fetch_all_pages(
                    self.connector.metadata.get_asset_types
                    if hasattr(self.connector.metadata, 'get_asset_types')
                    else lambda **kw: self._get_via_base_api("/assetTypes", **kw)
                )
                self._cache["asset_types"] = {item["name"]: item["id"] for item in data}

            elif category == "statuses":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/statuses", **kw)
                )
                self._cache["statuses"] = {item["name"]: item["id"] for item in data}

            elif category == "attribute_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/attributeTypes", **kw)
                )
                self._cache["attribute_types"] = {item["name"]: item["id"] for item in data}

            elif category == "domain_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/domainTypes", **kw)
                )
                self._cache["domain_types"] = {item["name"]: item["id"] for item in data}

            elif category == "relation_types":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/relationTypes", **kw)
                )
                self._cache["relation_types"] = {
                    f"{item['sourceType']['name']}_{item['targetType']['name']}": item["id"]
                    for item in data
                }

            elif category == "roles":
                data = self._fetch_all_pages(
                    lambda **kw: self._get_via_base_api("/roles", **kw)
                )
                self._cache["roles"] = {item["name"]: item["id"] for item in data}

            self._timestamps[category] = datetime.now()

        except Exception:
            # On error, set empty cache to avoid repeated failures
            self._cache[category] = {}
            self._timestamps[category] = datetime.now()

    def _get_via_base_api(self, endpoint: str, **kwargs: Any) -> Dict[str, Any]:
        """Make a request via the base API."""
        import requests
        url = f"{self.connector.api}{endpoint}"
        response = requests.get(
            url,
            auth=self.connector.auth,
            params=kwargs,
            timeout=self.connector.timeout
        )
        response.raise_for_status()
        return response.json()

    def _fetch_all_pages(self, fetch_func: Callable[..., Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fetch all pages of a paginated endpoint."""
        all_results = []
        offset = 0
        limit = 1000

        while True:
            response = fetch_func(offset=offset, limit=limit)
            results = response.get("results", [])
            all_results.extend(results)

            if len(results) < limit:
                break
            offset += limit

        return all_results

    def get_asset_type_id(self, name: str) -> Optional[str]:
        """Get asset type UUID by name."""
        self._refresh_if_needed("asset_types")
        return self._cache.get("asset_types", {}).get(name)

    def get_status_id(self, name: str) -> Optional[str]:
        """Get status UUID by name."""
        self._refresh_if_needed("statuses")
        return self._cache.get("statuses", {}).get(name)

    def get_attribute_type_id(self, name: str) -> Optional[str]:
        """Get attribute type UUID by name."""
        self._refresh_if_needed("attribute_types")
        return self._cache.get("attribute_types", {}).get(name)

    def get_domain_type_id(self, name: str) -> Optional[str]:
        """Get domain type UUID by name."""
        self._refresh_if_needed("domain_types")
        return self._cache.get("domain_types", {}).get(name)

    def get_role_id(self, name: str) -> Optional[str]:
        """Get role UUID by name."""
        self._refresh_if_needed("roles")
        return self._cache.get("roles", {}).get(name)

    def clear(self) -> None:
        """Clear all cached data."""
        with self._lock:
            self._cache.clear()
            self._timestamps.clear()

    def refresh_all(self) -> None:
        """Force refresh of all cached data."""
        categories = ["asset_types", "statuses", "attribute_types",
                      "domain_types", "relation_types", "roles"]
        with self._lock:
            for category in categories:
                self._refresh_category(category)
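
How the cache is meant to be used, as a sketch (not part of helpers.py) assuming `connector` is an already-configured CollibraConnector; the lookups below hit the API at most once per category within the TTL:

# Usage sketch (not part of helpers.py), assuming `connector` is an
# already-configured CollibraConnector instance.
cache = CachedMetadata(connector, ttl=3600)

# The first lookup per category triggers one (paginated) metadata fetch;
# later lookups within the TTL are served from the in-memory dict.
type_id = cache.get_asset_type_id("Business Term")
status_id = cache.get_status_id("Approved")
if type_id is None:
    print("Unknown asset type name (or the metadata fetch failed)")

cache.refresh_all()   # force-refetch every category now
cache.clear()         # or drop everything and refetch lazily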


def timed_cache(ttl_seconds: int = 300) -> Callable:
    """
    Decorator for caching function results with TTL.

    Args:
        ttl_seconds: Time-to-live in seconds for cached results.

    Example:
        >>> @timed_cache(ttl_seconds=60)
        ... def expensive_operation():
        ...     return fetch_data()
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        cache: Dict[str, tuple] = {}
        lock = Lock()

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            # Create cache key from args
            key = str((args, sorted(kwargs.items())))

            with lock:
                if key in cache:
                    result, timestamp = cache[key]
                    if datetime.now() - timestamp < timedelta(seconds=ttl_seconds):
                        return result

                # Call function and cache result
                result = func(*args, **kwargs)
                cache[key] = (result, datetime.now())
                return result

        # Add method to clear cache
        def clear_cache() -> None:
            with lock:
                cache.clear()

        wrapper.clear_cache = clear_cache  # type: ignore
        return wrapper

    return decorator
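
timed_cache can be exercised without any connector at all; this sketch (not part of helpers.py) uses a call counter to make the cache hits visible:

# Usage sketch (not part of helpers.py): the counter makes caching visible.
calls = {"n": 0}

@timed_cache(ttl_seconds=60)
def lookup(name: str) -> str:
    calls["n"] += 1
    return name.upper()

assert lookup("approved") == "APPROVED"
assert lookup("approved") == "APPROVED"   # second call served from cache
assert calls["n"] == 1

lookup.clear_cache()                      # drop all cached entries
assert lookup("approved") == "APPROVED"
assert calls["n"] == 2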


class DataFrameExporter:
    """
    Utility class for exporting Collibra data to pandas DataFrames.

    Provides methods for converting API responses and asset profiles
    to pandas DataFrames for analysis and export.

    Note: Requires pandas to be installed (`pip install pandas`).

    Example:
        >>> from collibra_connector import CollibraConnector, DataFrameExporter
        >>> connector = CollibraConnector(...)
        >>> exporter = DataFrameExporter(connector)
        >>> df = exporter.assets_to_dataframe(domain_id="domain-uuid")
        >>> df.to_csv("assets.csv")
    """

    def __init__(self, connector: "CollibraConnector") -> None:
        """
        Initialize the DataFrame exporter.

        Args:
            connector: The CollibraConnector instance.
        """
        self.connector = connector
        self._pandas = None

    def _get_pandas(self) -> Any:
        """Lazy-load pandas to avoid import errors if it is not installed."""
        if self._pandas is None:
            try:
                import pandas as pd
                self._pandas = pd
            except ImportError:
                raise ImportError(
                    "pandas is required for DataFrame export. "
                    "Install it with: pip install pandas"
                )
        return self._pandas

    def assets_to_dataframe(
        self,
        domain_id: Optional[str] = None,
        community_id: Optional[str] = None,
        asset_type_ids: Optional[List[str]] = None,
        limit: int = 1000,
        include_attributes: bool = True,
        include_relations: bool = False
    ) -> Any:
        """
        Export assets to a pandas DataFrame.

        Args:
            domain_id: Filter by domain ID.
            community_id: Filter by community ID.
            asset_type_ids: Filter by asset type IDs.
            limit: Maximum number of assets to fetch.
            include_attributes: Include asset attributes as columns.
            include_relations: Include relation summaries (slower).

        Returns:
            pandas DataFrame with asset data.

        Example:
            >>> df = exporter.assets_to_dataframe(domain_id="uuid", limit=500)
            >>> print(df.columns)
            >>> df.to_excel("assets.xlsx")
        """
        pd = self._get_pandas()

        # Fetch assets
        assets_result = self.connector.asset.find_assets(
            domain_id=domain_id,
            community_id=community_id,
            asset_type_ids=asset_type_ids,
            limit=limit
        )

        records = []
        for asset in assets_result.get("results", []):
            record = {
                "id": asset.get("id"),
                "name": asset.get("name"),
                "display_name": asset.get("displayName"),
                "type": asset.get("type", {}).get("name"),
                "status": asset.get("status", {}).get("name"),
                "domain": asset.get("domain", {}).get("name"),
                "created_on": asset.get("createdOn"),
                "last_modified_on": asset.get("lastModifiedOn"),
            }

            # Add attributes
            if include_attributes:
                try:
                    attrs = self.connector.attribute.get_attributes_as_dict(asset["id"])
                    for attr_name, attr_value in attrs.items():
                        col_name = f"attr_{attr_name.lower().replace(' ', '_')}"
                        # Strip HTML tags from rich-text attribute values
                        if isinstance(attr_value, str) and '<' in attr_value:
                            attr_value = re.sub(r'<[^>]+>', '', attr_value)
                        record[col_name] = attr_value
                except Exception:
                    pass

            # Add relations summary
            if include_relations:
                try:
                    relations = self.connector.relation.get_asset_relations(
                        asset["id"],
                        include_type_details=True
                    )
                    record["relations_outgoing"] = relations.get("outgoing_count", 0)
                    record["relations_incoming"] = relations.get("incoming_count", 0)
                except Exception:
                    pass

            records.append(record)

        return pd.DataFrame(records)

    def profiles_to_dataframe(
        self,
        asset_ids: List[str],
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> Any:
        """
        Export multiple asset profiles to a pandas DataFrame.

        Uses get_full_profile_flat() to get comprehensive data for each asset.

        Args:
            asset_ids: List of asset UUIDs to export.
            progress_callback: Optional callback(current, total) for progress updates.

        Returns:
            pandas DataFrame with flattened profile data.

        Example:
            >>> asset_ids = ["uuid1", "uuid2", "uuid3"]
            >>> df = exporter.profiles_to_dataframe(asset_ids)
            >>> df.to_csv("profiles.csv")
        """
        pd = self._get_pandas()

        records = []
        total = len(asset_ids)

        for i, asset_id in enumerate(asset_ids):
            try:
                flat_profile = self.connector.asset.get_full_profile_flat(asset_id)
                records.append(flat_profile)
            except Exception as e:
                # Include partial record with error
                records.append({
                    "id": asset_id,
                    "error": str(e)
                })

            if progress_callback:
                progress_callback(i + 1, total)

        return pd.DataFrame(records)

    def communities_to_dataframe(self, limit: int = 1000) -> Any:
        """
        Export communities to a pandas DataFrame.

        Args:
            limit: Maximum number of communities to fetch.

        Returns:
            pandas DataFrame with community data.
        """
        pd = self._get_pandas()

        result = self.connector.community.find_communities(limit=limit)

        records = []
        for comm in result.get("results", []):
            records.append({
                "id": comm.get("id"),
                "name": comm.get("name"),
                "description": comm.get("description"),
                "parent_id": comm.get("parent", {}).get("id") if comm.get("parent") else None,
                "parent_name": comm.get("parent", {}).get("name") if comm.get("parent") else None,
                "created_on": comm.get("createdOn"),
            })

        return pd.DataFrame(records)

    def domains_to_dataframe(
        self,
        community_id: Optional[str] = None,
        limit: int = 1000
    ) -> Any:
        """
        Export domains to a pandas DataFrame.

        Args:
            community_id: Filter by community ID.
            limit: Maximum number of domains to fetch.

        Returns:
            pandas DataFrame with domain data.
        """
        pd = self._get_pandas()

        result = self.connector.domain.find_domains(
            community_id=community_id,
            limit=limit
        )

        records = []
        for domain in result.get("results", []):
            records.append({
                "id": domain.get("id"),
                "name": domain.get("name"),
                "description": domain.get("description"),
                "type": domain.get("type", {}).get("name"),
                "community_id": domain.get("community", {}).get("id"),
                "community_name": domain.get("community", {}).get("name"),
                "created_on": domain.get("createdOn"),
            })

        return pd.DataFrame(records)
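
A sketch (not part of helpers.py) of the exporter chained end to end, assuming a configured `connector` and pandas installed; the domain UUID is a placeholder:

# Usage sketch (not part of helpers.py); the domain UUID is a placeholder.
exporter = DataFrameExporter(connector)

df = exporter.assets_to_dataframe(
    domain_id="00000000-0000-0000-0000-000000000000",
    limit=500,
    include_attributes=True,
)
print(df.shape, list(df.columns)[:8])

# Feed the ids back into the per-asset profile export.
profiles = exporter.profiles_to_dataframe(
    asset_ids=list(df["id"].dropna()),
    progress_callback=lambda cur, tot: print(f"profile {cur}/{tot}"),
)
profiles.to_csv("profiles.csv", index=False)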


class DataTransformer:
    """
    Utility class for transforming Collibra data structures.

    Provides methods for flattening nested responses, converting
    between formats, and extracting specific fields.
    """

    @staticmethod
    def flatten_asset(asset: Dict[str, Any]) -> Dict[str, Any]:
        """
        Flatten a nested asset response into a flat dictionary.

        Args:
            asset: The asset dictionary from the API.

        Returns:
            Flattened dictionary with dot-notation keys.
        """
        flat = {}

        def _flatten(obj: Any, prefix: str = "") -> None:
            if isinstance(obj, dict):
                for key, value in obj.items():
                    new_key = f"{prefix}.{key}" if prefix else key
                    if isinstance(value, (dict, list)):
                        _flatten(value, new_key)
                    else:
                        flat[new_key] = value
            elif isinstance(obj, list):
                for i, item in enumerate(obj):
                    _flatten(item, f"{prefix}[{i}]")
            else:
                flat[prefix] = obj

        _flatten(asset)
        return flat

    @staticmethod
    def extract_ids(items: List[Dict[str, Any]], key: str = "id") -> List[str]:
        """
        Extract a specific field from a list of dictionaries.

        Args:
            items: List of dictionaries.
            key: The key to extract.

        Returns:
            List of extracted values.
        """
        return [item.get(key) for item in items if item.get(key)]

    @staticmethod
    def group_by(items: List[Dict[str, Any]], key: str) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group items by a specific key value.

        Args:
            items: List of dictionaries.
            key: The key to group by.

        Returns:
            Dictionary mapping key values to lists of items.
        """
        grouped: Dict[str, List[Dict[str, Any]]] = {}
        for item in items:
            value = item.get(key, "unknown")
            if value not in grouped:
                grouped[value] = []
            grouped[value].append(item)
        return grouped

    @staticmethod
    def to_name_id_map(items: List[Dict[str, Any]]) -> Dict[str, str]:
        """
        Convert a list of items to a name->id mapping.

        Args:
            items: List of dictionaries with 'name' and 'id' keys.

        Returns:
            Dictionary mapping names to IDs.
        """
        return {item["name"]: item["id"] for item in items if "name" in item and "id" in item}
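
The transformer is pure Python, so its behavior can be shown directly; this sketch (not part of helpers.py) uses a toy nested payload:

# Usage sketch (not part of helpers.py): a toy nested payload.
asset = {
    "id": "a1",
    "name": "Customer",
    "type": {"name": "Table"},
    "tags": [{"name": "pii"}, {"name": "gold"}],
}

flat = DataTransformer.flatten_asset(asset)
# {'id': 'a1', 'name': 'Customer', 'type.name': 'Table',
#  'tags[0].name': 'pii', 'tags[1].name': 'gold'}

rows = [
    {"id": "1", "name": "a", "status": "Approved"},
    {"id": "2", "name": "b", "status": "Draft"},
    {"id": "3", "name": "c", "status": "Approved"},
]
assert DataTransformer.extract_ids(rows) == ["1", "2", "3"]
assert sorted(DataTransformer.group_by(rows, "status")) == ["Approved", "Draft"]
assert DataTransformer.to_name_id_map(rows) == {"a": "1", "b": "2", "c": "3"}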